diff --git a/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs b/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs new file mode 100644 index 00000000..7e4a1af6 --- /dev/null +++ b/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs @@ -0,0 +1,167 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.PunctuationAnalysis; + +namespace SIL.Machine.Corpora +{ + public class FallbackQuotationMarkResolver : IQuotationMarkResolver + { + private readonly IQuotationMarkResolutionSettings _settings; + public QuotationMarkMetadata LastQuotationMark { get; set; } + public HashSet Issues { get; } + + public FallbackQuotationMarkResolver(IQuotationMarkResolutionSettings settings) + { + _settings = settings; + LastQuotationMark = null; + Issues = new HashSet(); + } + + public void Reset() + { + LastQuotationMark = null; + Issues.Clear(); + } + + public IEnumerable ResolveQuotationMarks( + IReadOnlyList quotationMarkMatches + ) + { + foreach (QuotationMarkStringMatch quoteMatch in quotationMarkMatches) + { + foreach (QuotationMarkMetadata quotationMarkMetadata in ResolveQuotationMark(quoteMatch)) + { + yield return quotationMarkMetadata; + } + } + } + + public IEnumerable ResolveQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + if (IsOpeningQuotationMark(quotationMarkMatch)) + { + QuotationMarkMetadata quotationMark = ResolveOpeningMark(quotationMarkMatch); + if (quotationMark != null) + { + yield return quotationMark; + } + else + { + Issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark); + } + } + else if (IsClosingQuotationMark(quotationMarkMatch)) + { + QuotationMarkMetadata quotationMark = ResolveClosingMark(quotationMarkMatch); + if (quotationMark != null) + { + yield return quotationMark; + } + else + { + Issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark); + } + } + else + { + // Make a reasonable guess about the direction of the quotation mark + if (LastQuotationMark == null || 
LastQuotationMark.Direction == QuotationMarkDirection.Closing) + { + QuotationMarkMetadata quotationMark = ResolveOpeningMark(quotationMarkMatch); + if (quotationMark != null) + yield return quotationMark; + } + else + { + QuotationMarkMetadata quotationMark = ResolveClosingMark(quotationMarkMatch); + if (quotationMark != null) + yield return quotationMark; + } + Issues.Add(QuotationMarkResolutionIssue.AmbiguousQuotationMark); + } + } + + public bool IsOpeningQuotationMark(QuotationMarkStringMatch match) + { + if (_settings.IsValidOpeningQuotationMark(match) && _settings.IsValidClosingQuotationMark(match)) + { + return ( + match.IsAtStartOfSegment + || match.HasLeadingWhitespace() + || DoesMostRecentOpeningMarkImmediatelyPrecede(match) + || match.HasQuoteIntroducerInLeadingSubstring() + ) && !(match.HasTrailingWhitespace() || match.HasTrailingPunctuation()); + } + else if (_settings.IsValidOpeningQuotationMark(match)) + { + return true; + } + + return false; + } + + public bool DoesMostRecentOpeningMarkImmediatelyPrecede(QuotationMarkStringMatch match) + { + if (LastQuotationMark == null || LastQuotationMark.Direction != QuotationMarkDirection.Opening) + { + return false; + } + return LastQuotationMark.TextSegment.Equals(match.TextSegment) + && LastQuotationMark.EndIndex == match.StartIndex; + } + + public bool IsClosingQuotationMark(QuotationMarkStringMatch match) + { + if (_settings.IsValidOpeningQuotationMark(match) && _settings.IsValidClosingQuotationMark(match)) + { + return (match.HasTrailingWhitespace() || match.HasTrailingPunctuation() || match.IsAtEndOfSegment) + && !match.HasLeadingWhitespace(); + } + else if (_settings.IsValidClosingQuotationMark(match)) + { + return true; + } + + return false; + } + + public QuotationMarkMetadata ResolveOpeningMark(QuotationMarkStringMatch quotationMarkMatch) + { + HashSet possibleDepths = _settings.GetPossibleDepths( + quotationMarkMatch.QuotationMark, + QuotationMarkDirection.Opening + ); + if (possibleDepths.Count == 
0) + return null; + + QuotationMarkMetadata quotationMark = quotationMarkMatch.Resolve( + possibleDepths.Min(), + QuotationMarkDirection.Opening + ); + LastQuotationMark = quotationMark; + return quotationMark; + } + + public QuotationMarkMetadata ResolveClosingMark(QuotationMarkStringMatch quotationMarkMatch) + { + HashSet possibleDepths = _settings.GetPossibleDepths( + quotationMarkMatch.QuotationMark, + QuotationMarkDirection.Closing + ); + if (possibleDepths.Count == 0) + return null; + + QuotationMarkMetadata quote = quotationMarkMatch.Resolve( + possibleDepths.Min(), + QuotationMarkDirection.Closing + ); + LastQuotationMark = quote; + return quote; + } + + public HashSet GetIssues() + { + return Issues; + } + } +} diff --git a/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs b/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs new file mode 100644 index 00000000..db2c6a92 --- /dev/null +++ b/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs @@ -0,0 +1,56 @@ +using System; +using System.IO; +using System.Text; +using SIL.Machine.PunctuationAnalysis; + +namespace SIL.Machine.Corpora +{ + public abstract class ParatextProjectQuoteConventionDetector + { + private readonly ParatextProjectSettings _settings; + + protected ParatextProjectQuoteConventionDetector(ParatextProjectSettings settings) + { + _settings = settings; + } + + protected ParatextProjectQuoteConventionDetector(ParatextProjectSettingsParserBase settingsParser) + { + _settings = settingsParser.Parse(); + } + + public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetector handler = null) + { + handler = handler ?? 
new QuoteConventionDetector(); + foreach (string fileName in _settings.GetAllScriptureBookFileNames()) + { + if (!Exists(fileName)) + continue; + + string usfm; + using (var reader = new StreamReader(Open(fileName))) + { + usfm = reader.ReadToEnd(); + } + + try + { + UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification); + } + catch (Exception ex) + { + var sb = new StringBuilder(); + sb.Append($"An error occurred while parsing the usfm for '{fileName}`"); + if (!string.IsNullOrEmpty(_settings.Name)) + sb.Append($" in project '{_settings.Name}'"); + sb.Append($". Error: '{ex.Message}'"); + throw new InvalidOperationException(sb.ToString(), ex); + } + } + return handler.DetectQuotationConvention(); + } + + protected abstract bool Exists(string fileName); + protected abstract Stream Open(string fileName); + } +} diff --git a/src/SIL.Machine/Corpora/ParatextProjectSettings.cs b/src/SIL.Machine/Corpora/ParatextProjectSettings.cs index 6781a8ad..286c6c27 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectSettings.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectSettings.cs @@ -1,4 +1,5 @@ -using System.Globalization; +using System.Collections.Generic; +using System.Globalization; using System.Text; using SIL.Scripture; @@ -103,6 +104,16 @@ public string GetBookFileName(string bookId) return FileNamePrefix + bookPart + FileNameSuffix; } + public IEnumerable GetAllScriptureBookFileNames() + { + BookSet scriptureBooks = Canon.ScriptureBooks; + scriptureBooks.SelectAll(); + foreach (string bookId in scriptureBooks.SelectedBookIds) + { + yield return GetBookFileName(bookId); + } + } + private static string GetBookFileNameDigits(string bookId) { int bookNum = Canon.BookIdToNumber(bookId); diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs index 02cf07e3..65273298 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs +++ 
b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -21,7 +21,7 @@ protected ParatextProjectTextUpdaterBase(ParatextProjectSettingsParserBase setti public string UpdateUsfm( string bookId, - IReadOnlyList<(IReadOnlyList, string)> rows, + IReadOnlyList rows, string fullName = null, UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting, UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve, diff --git a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs index 93446437..485cb1fc 100644 --- a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs @@ -8,49 +8,60 @@ namespace SIL.Machine.Corpora { public class PlaceMarkersAlignmentInfo { - public IReadOnlyList Refs { get; } + public const string MetadataKey = "alignment_info"; + public IReadOnlyList SourceTokens { get; } public IReadOnlyList TranslationTokens { get; } public WordAlignmentMatrix Alignment { get; } + public UpdateUsfmMarkerBehavior ParagraphBehavior { get; } + public UpdateUsfmMarkerBehavior StyleBehavior { get; } public PlaceMarkersAlignmentInfo( - IReadOnlyList refs, IReadOnlyList sourceTokens, IReadOnlyList translationTokens, - WordAlignmentMatrix alignment + WordAlignmentMatrix alignment, + UpdateUsfmMarkerBehavior paragraphBehavior, + UpdateUsfmMarkerBehavior styleBehavior ) { - Refs = refs; SourceTokens = sourceTokens; TranslationTokens = translationTokens; Alignment = alignment; + ParagraphBehavior = paragraphBehavior; + StyleBehavior = styleBehavior; } } public class PlaceMarkersUsfmUpdateBlockHandler : IUsfmUpdateBlockHandler { - private readonly IDictionary _alignmentInfo; - - public PlaceMarkersUsfmUpdateBlockHandler(IEnumerable alignmentInfo) - { - _alignmentInfo = alignmentInfo.ToDictionary(info => info.Refs.First(), info => info); - } - public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock 
block) { string reference = block.Refs.FirstOrDefault().ToString(); var elements = block.Elements.ToList(); // Nothing to do if there are no markers to place or no alignment to use + if (!block.Metadata.TryGetValue(PlaceMarkersAlignmentInfo.MetadataKey, out object alignmentObject)) + { + return block; + } + if (!(alignmentObject is PlaceMarkersAlignmentInfo alignmentInfo)) + { + return block; + } if ( elements.Count == 0 - || !_alignmentInfo.TryGetValue(reference, out PlaceMarkersAlignmentInfo alignmentInfo) || alignmentInfo.Alignment.RowCount == 0 || alignmentInfo.Alignment.ColumnCount == 0 || !elements.Any(e => - e.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style) - && !e.MarkedForRemoval - && e.Tokens.Count == 1 + ( + e.Type == UsfmUpdateBlockElementType.Paragraph + && alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Preserve + && e.Tokens.Count == 1 + ) + || ( + e.Type == UsfmUpdateBlockElementType.Style + && alignmentInfo.StyleBehavior == UpdateUsfmMarkerBehavior.Preserve + ) ) ) { @@ -112,7 +123,13 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) { if (element.Type == UsfmUpdateBlockElementType.Text) { - if (element.MarkedForRemoval) + if ( + element.MarkedForRemoval + || ( + element.Type == UsfmUpdateBlockElementType.Paragraph + && alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Strip + ) + ) { string text = element.Tokens[0].ToUsfm(); sourceSentence += text; diff --git a/src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs new file mode 100644 index 00000000..c90827d5 --- /dev/null +++ b/src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs @@ -0,0 +1,14 @@ +using SIL.Machine.PunctuationAnalysis; + +namespace SIL.Machine.Corpora +{ + // This is a convenience class so that users don't have to know to normalize the source quote convention + public class QuotationMarkDenormalizationFirstPass : 
QuotationMarkUpdateFirstPass + { + public QuotationMarkDenormalizationFirstPass( + QuoteConvention sourceQuoteConvention, + QuoteConvention targetQuoteConvention + ) + : base(sourceQuoteConvention.Normalize(), targetQuoteConvention) { } + } +} diff --git a/src/SIL.Machine/Corpora/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs new file mode 100644 index 00000000..f5ac923f --- /dev/null +++ b/src/SIL.Machine/Corpora/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs @@ -0,0 +1,19 @@ +using SIL.Machine.PunctuationAnalysis; + +namespace SIL.Machine.Corpora +{ + public class QuotationMarkDenormalizationUsfmUpdateBlockHandler : QuoteConventionChangingUsfmUpdateBlockHandler + { + // This is a convenience class so that users don't have to know to normalize the source quote convention + public QuotationMarkDenormalizationUsfmUpdateBlockHandler( + QuoteConvention sourceQuoteConvention, + QuoteConvention targetQuoteConvention, + QuotationMarkUpdateSettings settings = null + ) + : base( + sourceQuoteConvention.Normalize(), + targetQuoteConvention, + settings ?? 
new QuotationMarkUpdateSettings() + ) { } + } +} diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs new file mode 100644 index 00000000..f5106501 --- /dev/null +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs @@ -0,0 +1,101 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.PunctuationAnalysis; + +namespace SIL.Machine.Corpora +{ + // Determines the best strategy to take for each chapter + public class QuotationMarkUpdateFirstPass : UsfmStructureExtractor + { + private readonly QuotationMarkFinder _quotationMarkFinder; + private readonly DepthBasedQuotationMarkResolver _quotationMarkResolver; + public bool WillFallbackModeWork { get; set; } + + public QuotationMarkUpdateFirstPass( + QuoteConvention sourceQuoteConvention, + QuoteConvention targetQuoteConvention + ) + { + _quotationMarkFinder = new QuotationMarkFinder( + new QuoteConventionSet(new List { sourceQuoteConvention, targetQuoteConvention }) + ); + _quotationMarkResolver = new DepthBasedQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(sourceQuoteConvention) + ); + WillFallbackModeWork = CheckWhetherFallbackModeWillWork(sourceQuoteConvention, targetQuoteConvention); + } + + public bool CheckWhetherFallbackModeWillWork( + QuoteConvention sourceQuoteConvention, + QuoteConvention targetQuoteConvention + ) + { + var targetMarkBySourceMark = new Dictionary(); + foreach ( + int depth in Enumerable.Range( + 1, + Math.Min(sourceQuoteConvention.NumLevels, targetQuoteConvention.NumLevels) + ) + ) + { + string openingQuotationMark = sourceQuoteConvention.GetOpeningQuotationMarkAtDepth(depth); + string closingQuotationMark = targetQuoteConvention.GetClosingQuotationMarkAtDepth(depth); + if ( + targetMarkBySourceMark.TryGetValue( + openingQuotationMark, + out string correspondingClosingQuotationMark + ) + && correspondingClosingQuotationMark != closingQuotationMark + ) + { + 
return false; + } + targetMarkBySourceMark[openingQuotationMark] = closingQuotationMark; + } + return true; + } + + public List FindBestChapterStrategies() + { + var bestActionsByChapter = new List(); + foreach (Chapter chapter in GetChapters()) + { + bestActionsByChapter.Add(FindBestStrategyForChapter(chapter)); + } + return bestActionsByChapter; + } + + public QuotationMarkUpdateStrategy FindBestStrategyForChapter(Chapter chapter) + { + List quotationMarkMatches = + _quotationMarkFinder.FindAllPotentialQuotationMarksInChapter(chapter); + + _quotationMarkResolver.Reset(); + + // Use ToList() to force evaluation of the generator + _quotationMarkResolver.ResolveQuotationMarks(quotationMarkMatches).ToList(); + + return ChooseBestStrategyBasedOnObservedIssues(_quotationMarkResolver.GetIssues()); + } + + public QuotationMarkUpdateStrategy ChooseBestStrategyBasedOnObservedIssues( + HashSet issues + ) + { + if (issues.Contains(QuotationMarkResolutionIssue.AmbiguousQuotationMark)) + return QuotationMarkUpdateStrategy.Skip; + + if ( + issues.Contains(QuotationMarkResolutionIssue.UnpairedQuotationMark) + || issues.Contains(QuotationMarkResolutionIssue.TooDeepNesting) + ) + { + if (WillFallbackModeWork) + return QuotationMarkUpdateStrategy.ApplyFallback; + return QuotationMarkUpdateStrategy.Skip; + } + return QuotationMarkUpdateStrategy.ApplyFull; + } + } +} diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs new file mode 100644 index 00000000..7791d048 --- /dev/null +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs @@ -0,0 +1,57 @@ +using System.Collections.Generic; +using System.Text.RegularExpressions; + +namespace SIL.Machine.PunctuationAnalysis +{ + public class QuotationMarkUpdateResolutionSettings : IQuotationMarkResolutionSettings + { + private readonly QuoteConvention _sourceQuoteConvention; + private readonly QuoteConventionSet 
_quoteConventionSingletonSet; + + public QuotationMarkUpdateResolutionSettings(QuoteConvention sourceQuoteConvention) + { + _sourceQuoteConvention = sourceQuoteConvention; + _quoteConventionSingletonSet = new QuoteConventionSet(new List { sourceQuoteConvention }); + } + + public bool AreMarksAValidPair(string openingMark, string closingMark) + { + return _quoteConventionSingletonSet.MarksAreAValidPair(openingMark, closingMark); + } + + public Regex GetClosingQuotationMarkRegex() + { + return _quoteConventionSingletonSet.ClosingQuotationMarkRegex; + } + + public Regex GetOpeningQuotationMarkRegex() + { + return _quoteConventionSingletonSet.OpeningQuotationMarkRegex; + } + + public HashSet GetPossibleDepths(string quotationMark, QuotationMarkDirection direction) + { + return _sourceQuoteConvention.GetPossibleDepths(quotationMark, direction); + } + + public bool IsValidClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + return quotationMarkMatch.IsValidClosingQuotationMark(_quoteConventionSingletonSet); + } + + public bool IsValidOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + return quotationMarkMatch.IsValidOpeningQuotationMark(_quoteConventionSingletonSet); + } + + public bool MetadataMatchesQuotationMark(string quotationMark, int depth, QuotationMarkDirection direction) + { + return _sourceQuoteConvention.GetExpectedQuotationMark(depth, direction) == quotationMark; + } + + public bool ShouldRelyOnParagraphMarkers() + { + return false; + } + } +} diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs new file mode 100644 index 00000000..fc8b50fb --- /dev/null +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs @@ -0,0 +1,28 @@ +using System.Collections.Generic; + +namespace SIL.Machine.Corpora +{ + public class QuotationMarkUpdateSettings + { + private readonly QuotationMarkUpdateStrategy _defaultChapterAction; + private readonly List 
_chapterActions; + + public QuotationMarkUpdateSettings( + QuotationMarkUpdateStrategy defaultChapterStrategy = QuotationMarkUpdateStrategy.ApplyFull, + List chapterStrategies = null + ) + { + _defaultChapterAction = defaultChapterStrategy; + _chapterActions = chapterStrategies ?? new List(); + } + + public QuotationMarkUpdateStrategy GetActionForChapter(int chapterNumber) + { + if (chapterNumber <= _chapterActions.Count) + { + return _chapterActions[chapterNumber - 1]; + } + return _defaultChapterAction; + } + } +} diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateStrategy.cs b/src/SIL.Machine/Corpora/QuotationMarkUpdateStrategy.cs new file mode 100644 index 00000000..e6ae10b0 --- /dev/null +++ b/src/SIL.Machine/Corpora/QuotationMarkUpdateStrategy.cs @@ -0,0 +1,9 @@ +namespace SIL.Machine.Corpora +{ + public enum QuotationMarkUpdateStrategy + { + ApplyFull, + ApplyFallback, + Skip + } +} diff --git a/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs new file mode 100644 index 00000000..0817854d --- /dev/null +++ b/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs @@ -0,0 +1,229 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.PunctuationAnalysis; + +namespace SIL.Machine.Corpora +{ + public class QuoteConventionChangingUsfmUpdateBlockHandler : IUsfmUpdateBlockHandler + { + private readonly QuoteConvention _sourceQuoteConvention; + private readonly QuoteConvention _targetQuoteConvention; + private readonly QuotationMarkUpdateSettings _settings; + protected QuotationMarkFinder QuotationMarkFinder { get; set; } + protected TextSegment.Builder NextScriptureTextSegmentBuilder { get; set; } + protected IQuotationMarkResolver VerseTextQuotationMarkResolver { get; set; } + private readonly IQuotationMarkResolver _embedQuotationMarkResolver; + private readonly IQuotationMarkResolver _simpleQuotationMarkResolver; + 
protected QuotationMarkUpdateStrategy CurrentStrategy { get; set; } + protected int CurrentChapterNumber { get; set; } + private int _currentVerseNumber; + + public QuoteConventionChangingUsfmUpdateBlockHandler( + QuoteConvention sourceQuoteConvention, + QuoteConvention targetQuoteConvention, + QuotationMarkUpdateSettings settings + ) + { + _sourceQuoteConvention = sourceQuoteConvention; + _targetQuoteConvention = targetQuoteConvention; + _settings = settings; + + QuotationMarkFinder = new QuotationMarkFinder( + new QuoteConventionSet(new List { _sourceQuoteConvention }) + ); + + NextScriptureTextSegmentBuilder = new TextSegment.Builder(); + + IQuotationMarkResolutionSettings resolutionSettings = new QuotationMarkUpdateResolutionSettings( + sourceQuoteConvention + ); + + // Each embed represents a separate context for quotation marks + // (i.e. you can't open a quote in one context and close it in another) + // so we need to keep track of the verse and embed contexts separately. + VerseTextQuotationMarkResolver = new DepthBasedQuotationMarkResolver(resolutionSettings); + _embedQuotationMarkResolver = new DepthBasedQuotationMarkResolver(resolutionSettings); + _simpleQuotationMarkResolver = new FallbackQuotationMarkResolver(resolutionSettings); + + CurrentStrategy = QuotationMarkUpdateStrategy.ApplyFull; + CurrentChapterNumber = 0; + _currentVerseNumber = 0; + } + + public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) + { + CheckForChapterChange(block); + CheckForVerseChange(block); + if (CurrentStrategy == QuotationMarkUpdateStrategy.Skip) + return block; + if (CurrentStrategy == QuotationMarkUpdateStrategy.ApplyFallback) + { + return ApplyFallbackUpdating(block); + } + return ApplyStandardUpdating(block); + } + + private UsfmUpdateBlock ApplyFallbackUpdating(UsfmUpdateBlock block) + { + foreach (UsfmUpdateBlockElement element in block.Elements) + ProcessScriptureElement(element, _simpleQuotationMarkResolver); + return block; + } + + private UsfmUpdateBlock 
ApplyStandardUpdating(UsfmUpdateBlock block) + { + foreach (UsfmUpdateBlockElement element in block.Elements) + { + if (element.Type == UsfmUpdateBlockElementType.Embed) + { + _embedQuotationMarkResolver.Reset(); + ProcessScriptureElement(element, _embedQuotationMarkResolver); + } + else + { + ProcessScriptureElement(element, VerseTextQuotationMarkResolver); + } + } + return block; + } + + protected void ProcessScriptureElement( + UsfmUpdateBlockElement element, + IQuotationMarkResolver quotationMarkResolver + ) + { + List textSegments = CreateTextSegments(element); + List quotationMarkMatches = + QuotationMarkFinder.FindAllPotentialQuotationMarksInTextSegments(textSegments); + List resolvedQuotationMarkMatches = quotationMarkResolver + .ResolveQuotationMarks(quotationMarkMatches) + .ToList(); + UpdateQuotationMarks(resolvedQuotationMarkMatches); + } + + protected List CreateTextSegments(UsfmUpdateBlockElement element) + { + var textSegments = new List(); + foreach (UsfmToken token in element.GetTokens()) + { + switch (token.Type) + { + case UsfmTokenType.Verse: + NextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse); + break; + case UsfmTokenType.Paragraph: + NextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Paragraph); + break; + case UsfmTokenType.Character: + NextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Character); + break; + case UsfmTokenType.Note: + NextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed); + break; + case UsfmTokenType.Text: + TextSegment textSegment = CreateTextSegment(token); + if (textSegment != null) + textSegments.Add(textSegment); + break; + } + } + return SetPreviousAndNextForSegments(textSegments); + } + + public void UpdateQuotationMarks(List resolvedQuotationMarkMatches) + { + foreach ( + ( + int quotationMarkIndex, + QuotationMarkMetadata resolvedQuotationMarkMatch + ) in resolvedQuotationMarkMatches.Select((r, i) => (i, r)) + ) + { + int previousLength = 
resolvedQuotationMarkMatch.Length; + resolvedQuotationMarkMatch.UpdateQuotationMark(_targetQuoteConvention); + int updatedLength = resolvedQuotationMarkMatch.Length; + + if (previousLength != updatedLength) + { + ShiftQuotationMarkMetadataIndices( + resolvedQuotationMarkMatches.Skip(quotationMarkIndex + 1).ToList(), + updatedLength - previousLength + ); + } + } + } + + private void ShiftQuotationMarkMetadataIndices( + List quotationMarkMetadataList, + int shiftAmount + ) + { + foreach (QuotationMarkMetadata quotationMarkMetadata in quotationMarkMetadataList) + { + quotationMarkMetadata.ShiftIndices(shiftAmount); + } + } + + protected TextSegment CreateTextSegment(UsfmToken token) + { + TextSegment textSegmentToReturn = null; + NextScriptureTextSegmentBuilder.SetUsfmToken(token); + if (token.Text != null) + { + NextScriptureTextSegmentBuilder.SetText(token.Text); + textSegmentToReturn = NextScriptureTextSegmentBuilder.Build(); + } + NextScriptureTextSegmentBuilder = new TextSegment.Builder(); + return textSegmentToReturn; + } + + protected List SetPreviousAndNextForSegments(List textSegments) + { + for (int i = 0; i < textSegments.Count; i++) + { + if (i > 0) + textSegments[i].PreviousSegment = textSegments[i - 1]; + if (i < textSegments.Count - 1) + textSegments[i].NextSegment = textSegments[i + 1]; + } + return textSegments; + } + + protected void CheckForChapterChange(UsfmUpdateBlock block) + { + foreach (ScriptureRef scriptureRef in block.Refs) + { + if (scriptureRef.ChapterNum != CurrentChapterNumber) + { + StartNewChapter(scriptureRef.ChapterNum); + } + } + } + + protected void StartNewChapter(int newChapterNumber) + { + CurrentChapterNumber = newChapterNumber; + CurrentStrategy = _settings.GetActionForChapter(newChapterNumber); + VerseTextQuotationMarkResolver.Reset(); + NextScriptureTextSegmentBuilder = new TextSegment.Builder(); + NextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Chapter); + } + + private void 
CheckForVerseChange(UsfmUpdateBlock block) + { + foreach (ScriptureRef scriptureRef in block.Refs) + { + if (scriptureRef.ChapterNum == CurrentChapterNumber && scriptureRef.VerseNum != _currentVerseNumber) + { + StartNewVerse(scriptureRef.VerseNum); + } + } + } + + private void StartNewVerse(int newVerseNumber) + { + _currentVerseNumber = newVerseNumber; + NextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse); + } + } +} diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 0da338ef..6883b10d 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -17,13 +17,31 @@ public enum UpdateUsfmMarkerBehavior Strip, } + public class UpdateUsfmRow + { + public IReadOnlyList Refs { get; } + public string Text { get; } + public IReadOnlyDictionary Metadata { get; } + + public UpdateUsfmRow( + IReadOnlyList refs, + string text, + IReadOnlyDictionary metadata = null + ) + { + Refs = refs; + Text = text; + Metadata = metadata ?? new Dictionary(); + } + } + /*** * This is a USFM parser handler that can be used to replace the existing text in a USFM file with the specified * text. 
*/ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase { - private readonly IReadOnlyList<(IReadOnlyList, string)> _rows; + private readonly IReadOnlyList _rows; private readonly List _tokens; private readonly List _updatedText; private readonly List _embedTokens; @@ -41,7 +59,7 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase private int _tokenIndex; public UpdateUsfmParserHandler( - IReadOnlyList<(IReadOnlyList, string)> rows = null, + IReadOnlyList rows = null, string idText = null, UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting, UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve, @@ -52,7 +70,7 @@ public UpdateUsfmParserHandler( IEnumerable remarks = null ) { - _rows = rows ?? Array.Empty<(IReadOnlyList, string)>(); + _rows = rows ?? Array.Empty(); _tokens = new List(); _updatedText = new List(); _updateBlocks = new Stack(); @@ -89,14 +107,6 @@ public override void StartBook(UsfmParserState state, string marker, string code var startBookTokens = new List(); if (_idText != null) startBookTokens.Add(new UsfmToken(_idText + " ")); - if (_remarks.Count() > 0) - { - foreach (string remark in _remarks) - { - startBookTokens.Add(new UsfmToken(UsfmTokenType.Paragraph, "rem", null, null)); - startBookTokens.Add(new UsfmToken(remark)); - } - } PushUpdatedText(startBookTokens); base.StartBook(state, marker, code); @@ -366,19 +376,49 @@ public string GetUsfm(string stylesheetFileName = "usfm.sty") public string GetUsfm(UsfmStylesheet stylesheet) { var tokenizer = new UsfmTokenizer(stylesheet); - return tokenizer.Detokenize(_tokens); + List tokens = new List(_tokens); + if (_remarks.Count() > 0) + { + var remarkTokens = new List(); + foreach (string remark in _remarks) + { + remarkTokens.Add(new UsfmToken(UsfmTokenType.Paragraph, "rem", null, null)); + remarkTokens.Add(new UsfmToken(remark)); + } + + if (tokens.Count > 0 && tokens[0].Marker == "id") + { + if 
(tokens.Count > 1 && tokens[1].Type == UsfmTokenType.Text) + { + tokens.InsertRange(2, remarkTokens); + } + else + { + tokens.InsertRange(1, remarkTokens); + } + } + } + return tokenizer.Detokenize(tokens); } - private IReadOnlyList AdvanceRows(IReadOnlyList segScrRefs) + private (IReadOnlyList RowTexts, Dictionary Metadata) AdvanceRows( + IReadOnlyList segScrRefs + ) { var rowTexts = new List(); + Dictionary rowMetadata = null; int sourceIndex = 0; // search the sorted rows with updated text, starting from where we left off last. while (_rowIndex < _rows.Count && sourceIndex < segScrRefs.Count) { // get the set of references for the current row int compare = 0; - (IReadOnlyList rowScrRefs, string text) = _rows[_rowIndex]; + UpdateUsfmRow row = _rows[_rowIndex]; + (IReadOnlyList rowScrRefs, string text, IReadOnlyDictionary metadata) = ( + row.Refs, + row.Text, + row.Metadata + ); foreach (ScriptureRef rowScrRef in rowScrRefs) { while (sourceIndex < segScrRefs.Count) @@ -395,6 +435,7 @@ private IReadOnlyList AdvanceRows(IReadOnlyList segScrRefs // source and row match // grab the text - both source and row will be incremented in due time... rowTexts.Add(text); + rowMetadata = metadata.ToDictionary(kvp => kvp.Key, kvp => kvp.Value); break; } } @@ -404,7 +445,7 @@ private IReadOnlyList AdvanceRows(IReadOnlyList segScrRefs _rowIndex++; } } - return rowTexts; + return (rowTexts, rowMetadata); } private void CollectUpdatableTokens(UsfmParserState state) @@ -508,8 +549,10 @@ private bool HasNewText() private void StartUpdateBlock(IReadOnlyList scriptureRefs) { - _updateBlocks.Push(new UsfmUpdateBlock(scriptureRefs)); - IReadOnlyList rowTexts = AdvanceRows(scriptureRefs); + (IReadOnlyList rowTexts, Dictionary metadata) = AdvanceRows(scriptureRefs); + _updateBlocks.Push( + new UsfmUpdateBlock(scriptureRefs, metadata: metadata ?? 
new Dictionary()) + ); PushUpdatedText(rowTexts.Select(t => new UsfmToken(t + " "))); } diff --git a/src/SIL.Machine/Corpora/UsfmParserState.cs b/src/SIL.Machine/Corpora/UsfmParserState.cs index 1b0952f2..b6784096 100644 --- a/src/SIL.Machine/Corpora/UsfmParserState.cs +++ b/src/SIL.Machine/Corpora/UsfmParserState.cs @@ -54,7 +54,7 @@ public UsfmParserState(UsfmStylesheet stylesheet, ScrVers versification, IReadOn /// /// Current verse reference /// - public VerseRef VerseRef { get; internal set; } + public VerseRef VerseRef { get; protected internal set; } /// /// Offset of start of token in verse diff --git a/src/SIL.Machine/Corpora/UsfmToken.cs b/src/SIL.Machine/Corpora/UsfmToken.cs index c0b105b9..90b934f2 100644 --- a/src/SIL.Machine/Corpora/UsfmToken.cs +++ b/src/SIL.Machine/Corpora/UsfmToken.cs @@ -1,4 +1,5 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; using System.Linq; using System.Text; using System.Text.RegularExpressions; @@ -21,7 +22,7 @@ public enum UsfmTokenType Unknown } - public class UsfmToken + public class UsfmToken : IEquatable { private const string FullAttributeStr = @"(?[-\w]+)\s*\=\s*\""(?.+?)\""\s*"; private static readonly Regex AttributeRegex = new Regex( @@ -64,6 +65,39 @@ public string NestlessMarker get { return Marker != null && Marker[0] == '+' ? Marker.Substring(1) : Marker; } } + public override bool Equals(object obj) + { + if (obj is UsfmToken other) + { + return Equals(other); + } + return false; + } + + public bool Equals(UsfmToken other) + { + return Type == other.Type + && Marker == other.Marker + && Text == other.Text + && EndMarker == other.EndMarker + && Data == other.Data + && LineNumber == other.LineNumber + && ColumnNumber == other.ColumnNumber; + } + + public override int GetHashCode() + { + int hashCode = 23; + hashCode = hashCode * 31 + Type.GetHashCode(); + hashCode = hashCode * 31 + (Marker?.GetHashCode() ?? 0); + hashCode = hashCode * 31 + (Text?.GetHashCode() ?? 
0); + hashCode = hashCode * 31 + (EndMarker?.GetHashCode() ?? 0); + hashCode = hashCode * 31 + (Data?.GetHashCode() ?? 0); + hashCode = hashCode * 31 + LineNumber.GetHashCode(); + hashCode = hashCode * 31 + ColumnNumber.GetHashCode(); + return hashCode; + } + public string GetAttribute(string name) { if (Attributes == null || Attributes.Count == 0) diff --git a/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs b/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs index 6640e96a..22140729 100644 --- a/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs +++ b/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs @@ -13,17 +13,24 @@ public IReadOnlyList Elements { get => _elements; } + public IReadOnlyDictionary Metadata + { + get => _metadata; + } private readonly List _refs; private readonly List _elements; + private readonly Dictionary _metadata; public UsfmUpdateBlock( IEnumerable refs = null, - IEnumerable elements = null + IEnumerable elements = null, + Dictionary metadata = null ) { - _refs = refs != null ? refs.ToList() : new List(); - _elements = elements != null ? elements.ToList() : new List(); + _refs = refs?.ToList() ?? new List(); + _elements = elements?.ToList() ?? new List(); + _metadata = metadata ?? 
new Dictionary(); } public void AddText(IEnumerable tokens) @@ -100,7 +107,10 @@ public override bool Equals(object obj) UsfmUpdateBlock other = (UsfmUpdateBlock)obj; - return _refs.SequenceEqual(other._refs) && _elements.SequenceEqual(other._elements); + return _refs.SequenceEqual(other._refs) + && _elements.SequenceEqual(other._elements) + && _metadata.Count == other.Metadata.Count + && !_metadata.Except(other.Metadata).Any(); } public override int GetHashCode() diff --git a/src/SIL.Machine/Corpora/ZipParatextProjectQuoteConventionDetector.cs b/src/SIL.Machine/Corpora/ZipParatextProjectQuoteConventionDetector.cs new file mode 100644 index 00000000..91736056 --- /dev/null +++ b/src/SIL.Machine/Corpora/ZipParatextProjectQuoteConventionDetector.cs @@ -0,0 +1,29 @@ +using System.IO; +using System.IO.Compression; + +namespace SIL.Machine.Corpora +{ + public class ZipParatextProjectQuoteConventionDetector : ParatextProjectQuoteConventionDetector + { + private readonly ZipArchive _archive; + + public ZipParatextProjectQuoteConventionDetector(ZipArchive archive) + : base(new ZipParatextProjectSettingsParser(archive)) + { + _archive = archive; + } + + protected override bool Exists(string fileName) + { + return _archive.GetEntry(fileName) != null; + } + + protected override Stream Open(string fileName) + { + ZipArchiveEntry entry = _archive.GetEntry(fileName); + if (entry == null) + return null; + return entry.Open(); + } + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/Chapter.cs b/src/SIL.Machine/PunctuationAnalysis/Chapter.cs new file mode 100644 index 00000000..a5c5bc62 --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/Chapter.cs @@ -0,0 +1,15 @@ +using System.Collections.Generic; +using System.Linq; + +namespace SIL.Machine.PunctuationAnalysis +{ + public class Chapter + { + public Chapter(IEnumerable verses) + { + Verses = verses.ToList(); + } + + public List Verses { get; set; } + } +} diff --git 
a/src/SIL.Machine/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs b/src/SIL.Machine/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs new file mode 100644 index 00000000..e48e6a7e --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/DepthBasedQuotationMarkResolver.cs @@ -0,0 +1,553 @@ +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using System.Text.RegularExpressions; + +namespace SIL.Machine.PunctuationAnalysis +{ + public class QuotationMarkResolverState + { + public ImmutableStack Quotations + { + get => ImmutableStack.CreateRange(_quotations); + } + + private readonly Stack _quotations; + + public QuotationMarkResolverState() + { + _quotations = new Stack(); + } + + public void Reset() + { + _quotations.Clear(); + } + + public int CurrentDepth => _quotations.Count; + + public bool HasOpenQuotationMark => CurrentDepth > 0; + + public bool AreMoreThanNQuotesOpen(int n) => CurrentDepth > n; + + public QuotationMarkMetadata AddOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + QuotationMarkMetadata quotationMark = quotationMarkMatch.Resolve( + CurrentDepth + 1, + QuotationMarkDirection.Opening + ); + _quotations.Push(quotationMark); + return quotationMark; + } + + public QuotationMarkMetadata AddClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + QuotationMarkMetadata quotationMark = quotationMarkMatch.Resolve( + CurrentDepth, + QuotationMarkDirection.Closing + ); + _quotations.Pop(); + return quotationMark; + } + + public string GetOpeningQuotationMarkAtDepth(int depth) + { + if (depth > CurrentDepth) + { + throw new InvalidOperationException( + $"Opening quotation mark at depth {depth} was requested from a quotation stack with depth {CurrentDepth}." 
+ ); + } + // Stack is stored in reverse order + return _quotations.ToArray()[CurrentDepth - depth].QuotationMark; + } + + public string GetDeepestOpeningQuotationMark() + { + if (!HasOpenQuotationMark) + { + throw new InvalidOperationException( + "The deepest opening quotation mark was requested from an empty quotation stack." + ); + } + return _quotations.Peek().QuotationMark; + } + } + + public enum QuoteContinuerStyle + { + Undetermined, + English, + Spanish + } + + public class QuoteContinuerState + { + private readonly Stack _quoteContinuerMarks; + public ImmutableStack QuoteContinuerMarks + { + get => ImmutableStack.CreateRange(_quoteContinuerMarks); + } + public QuoteContinuerStyle ContinuerStyle { get; protected set; } + public int CurrentDepth => _quoteContinuerMarks.Count; + + public QuoteContinuerState() + { + _quoteContinuerMarks = new Stack(); + ContinuerStyle = QuoteContinuerStyle.Undetermined; + } + + public void Reset() + { + _quoteContinuerMarks.Clear(); + ContinuerStyle = QuoteContinuerStyle.Undetermined; + } + + public bool ContinuerHasBeenObserved() + { + return _quoteContinuerMarks.Count > 0; + } + + public QuotationMarkMetadata AddQuoteContinuer( + QuotationMarkStringMatch quotationMarkMatch, + QuotationMarkResolverState quotationMarkResolverState, + QuoteContinuerStyle quoteContinuerStyle + ) + { + QuotationMarkMetadata quote = quotationMarkMatch.Resolve( + _quoteContinuerMarks.Count + 1, + QuotationMarkDirection.Opening + ); + _quoteContinuerMarks.Push(quote); + ContinuerStyle = quoteContinuerStyle; + if (CurrentDepth == quotationMarkResolverState.CurrentDepth) + { + _quoteContinuerMarks.Clear(); + } + return quote; + } + } + + public class QuotationMarkCategorizer + { + private static readonly Regex ApostrophePattern = new Regex(@"[\'\u2019\u2018]", RegexOptions.Compiled); + private readonly IQuotationMarkResolutionSettings _settings; + private readonly QuotationMarkResolverState _quotationMarkResolverState; + private readonly 
QuoteContinuerState _quoteContinuerState; + + public QuotationMarkCategorizer( + IQuotationMarkResolutionSettings quotationMarkResolutionSettings, + QuotationMarkResolverState quotationMarkResolverState, + QuoteContinuerState quotationContinuerState + ) + { + _settings = quotationMarkResolutionSettings; + _quotationMarkResolverState = quotationMarkResolverState; + _quoteContinuerState = quotationContinuerState; + } + + public bool IsEnglishQuoteContinuer( + QuotationMarkStringMatch quotationMarkMatch, + QuotationMarkStringMatch previousMatch, + QuotationMarkStringMatch nextMatch + ) + { + if (_quoteContinuerState.ContinuerStyle == QuoteContinuerStyle.Spanish) + return false; + if (!MeetsQuoteContinuerPrerequisites(quotationMarkMatch)) + return false; + if ( + quotationMarkMatch.QuotationMark + != _quotationMarkResolverState.GetOpeningQuotationMarkAtDepth(_quoteContinuerState.CurrentDepth + 1) + ) + { + return false; + } + if (!_quoteContinuerState.ContinuerHasBeenObserved()) + { + if (quotationMarkMatch.StartIndex > 0) + return false; + + // Check the next quotation mark match, since quote continuers must appear consecutively + if (_quotationMarkResolverState.AreMoreThanNQuotesOpen(1)) + { + if (nextMatch == null || nextMatch.StartIndex != quotationMarkMatch.EndIndex) + return false; + } + } + return true; + } + + public bool IsSpanishQuoteContinuer( + QuotationMarkStringMatch quotationMarkMatch, + QuotationMarkStringMatch previousMatch, + QuotationMarkStringMatch nextMatch + ) + { + if (_quoteContinuerState.ContinuerStyle == QuoteContinuerStyle.English) + return false; + if (!MeetsQuoteContinuerPrerequisites(quotationMarkMatch)) + return false; + + if ( + !_settings.AreMarksAValidPair( + _quotationMarkResolverState.GetOpeningQuotationMarkAtDepth(_quoteContinuerState.CurrentDepth + 1), + quotationMarkMatch.QuotationMark + ) + ) + { + return false; + } + + if (!_quoteContinuerState.ContinuerHasBeenObserved()) + { + if (quotationMarkMatch.StartIndex > 0) + return 
false; + + // This has only been observed with guillemets so far + if (quotationMarkMatch.QuotationMark != "»") + return false; + + // Check the next quotation mark match, since quote continuers must appear consecutively + if (_quotationMarkResolverState.AreMoreThanNQuotesOpen(1)) + { + if (nextMatch == null || nextMatch.StartIndex != quotationMarkMatch.EndIndex) + return false; + } + } + return true; + } + + private bool MeetsQuoteContinuerPrerequisites(QuotationMarkStringMatch quotationMarkMatch) + { + if (_quoteContinuerState.CurrentDepth >= _quotationMarkResolverState.CurrentDepth) + return false; + if ( + _settings.ShouldRelyOnParagraphMarkers() + && !quotationMarkMatch.TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph) + ) + { + return false; + } + if (!_quotationMarkResolverState.HasOpenQuotationMark) + return false; + return true; + } + + public bool IsOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + if (!_settings.IsValidOpeningQuotationMark(quotationMarkMatch)) + return false; + + // If the quote is ambiguous, use whitespace as clue + if (_settings.IsValidClosingQuotationMark(quotationMarkMatch)) + { + return ( + quotationMarkMatch.HasLeadingWhitespace() + || MostRecentOpeningMarkImmediatelyPrecedes(quotationMarkMatch) + || quotationMarkMatch.HasQuoteIntroducerInLeadingSubstring() + ) && !(quotationMarkMatch.HasTrailingWhitespace() || quotationMarkMatch.HasTrailingPunctuation()); + } + return true; + } + + public bool IsClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + if (!_settings.IsValidClosingQuotationMark(quotationMarkMatch)) + return false; + + // If the quote is ambiguous, use whitespace as clue + if (_settings.IsValidOpeningQuotationMark(quotationMarkMatch)) + { + return ( + quotationMarkMatch.HasTrailingWhitespace() + || quotationMarkMatch.HasTrailingPunctuation() + || quotationMarkMatch.IsAtEndOfSegment + || 
quotationMarkMatch.NextCharacterMatches(_settings.GetClosingQuotationMarkRegex()) + ) && !quotationMarkMatch.HasLeadingWhitespace(); + } + return true; + } + + public bool IsMalformedOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + if (!_settings.IsValidOpeningQuotationMark(quotationMarkMatch)) + return false; + + if (quotationMarkMatch.HasQuoteIntroducerInLeadingSubstring()) + return true; + + if ( + quotationMarkMatch.HasLeadingWhitespace() + && quotationMarkMatch.HasTrailingWhitespace() + && !_quotationMarkResolverState.HasOpenQuotationMark + ) + { + return true; + } + + return false; + } + + public bool IsMalformedClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + if (!_settings.IsValidClosingQuotationMark(quotationMarkMatch)) + return false; + + return ( + ( + quotationMarkMatch.IsAtEndOfSegment + || !quotationMarkMatch.HasTrailingWhitespace() + || (quotationMarkMatch.HasLeadingWhitespace() && quotationMarkMatch.HasTrailingWhitespace()) + ) + && _quotationMarkResolverState.HasOpenQuotationMark + && _settings.AreMarksAValidPair( + _quotationMarkResolverState.GetDeepestOpeningQuotationMark(), + quotationMarkMatch.QuotationMark + ) + ); + } + + public bool IsUnpairedClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + if (!_settings.IsValidClosingQuotationMark(quotationMarkMatch)) + return false; + + if (_quotationMarkResolverState.HasOpenQuotationMark) + return false; + + return !quotationMarkMatch.HasLeadingWhitespace() + && (quotationMarkMatch.IsAtEndOfSegment || quotationMarkMatch.HasTrailingWhitespace()); + } + + private bool MostRecentOpeningMarkImmediatelyPrecedes(QuotationMarkStringMatch quotationMarkMatch) + { + if (!_quotationMarkResolverState.HasOpenQuotationMark) + return false; + + return _quotationMarkResolverState.GetDeepestOpeningQuotationMark() == quotationMarkMatch.PreviousCharacter; + } + + public bool IsApostrophe(QuotationMarkStringMatch quotationMarkMatch, 
QuotationMarkStringMatch nextMatch) + { + if (!quotationMarkMatch.QuotationMarkMatches(ApostrophePattern)) + return false; + + // Latin letters on both sides of punctuation mark + if ( + quotationMarkMatch.PreviousCharacter != null + && quotationMarkMatch.HasLeadingLatinLetter() + && quotationMarkMatch.NextCharacter != null + && quotationMarkMatch.HasTrailingLatinLetter() + ) + { + return true; + } + + // Potential final s possessive (e.g. Moses') + if ( + quotationMarkMatch.PreviousCharacterMatches(new Regex(@"s", RegexOptions.Compiled)) + && (quotationMarkMatch.HasTrailingWhitespace() || quotationMarkMatch.HasTrailingPunctuation()) + ) + { + // Check whether it could be a closing quotation mark + if (!_quotationMarkResolverState.HasOpenQuotationMark) + return true; + if ( + !_settings.AreMarksAValidPair( + _quotationMarkResolverState.GetDeepestOpeningQuotationMark(), + quotationMarkMatch.QuotationMark + ) + ) + { + return true; + } + if ( + nextMatch != null + && _settings.AreMarksAValidPair( + _quotationMarkResolverState.GetDeepestOpeningQuotationMark(), + nextMatch.QuotationMark + ) + ) + { + return true; + } + } + + // For languages that use apostrophes at the start and end of words + if ( + !_quotationMarkResolverState.HasOpenQuotationMark && quotationMarkMatch.QuotationMark == "'" + || _quotationMarkResolverState.HasOpenQuotationMark + && !_settings.AreMarksAValidPair( + _quotationMarkResolverState.GetDeepestOpeningQuotationMark(), + quotationMarkMatch.QuotationMark + ) + ) + { + return true; + } + + return false; + } + } + + public class DepthBasedQuotationMarkResolver : IQuotationMarkResolver + { + public IQuotationMarkResolutionSettings Settings { get; } + public QuotationMarkResolverState QuotationMarkResolverState { get; } + public QuoteContinuerState QuoteContinuerState { get; } + public QuotationMarkCategorizer QuotationMarkCategorizer { get; } + protected HashSet Issues { get; } + + public 
DepthBasedQuotationMarkResolver(IQuotationMarkResolutionSettings settings) + { + Settings = settings; + QuotationMarkResolverState = new QuotationMarkResolverState(); + QuoteContinuerState = new QuoteContinuerState(); + QuotationMarkCategorizer = new QuotationMarkCategorizer( + Settings, + QuotationMarkResolverState, + QuoteContinuerState + ); + Issues = new HashSet(); + } + + public virtual void Reset() + { + QuotationMarkResolverState.Reset(); + QuoteContinuerState.Reset(); + Issues.Clear(); + } + + public virtual IEnumerable ResolveQuotationMarks( + IReadOnlyList quotationMarkMatches + ) + { + foreach ( + (int index, QuotationMarkStringMatch quotationMarkMatch) in quotationMarkMatches.Select( + (q, i) => (i, q) + ) + ) + { + QuotationMarkStringMatch previousMark = index == 0 ? null : quotationMarkMatches[index - 1]; + QuotationMarkStringMatch nextMark = + index == quotationMarkMatches.Count - 1 ? null : quotationMarkMatches[index + 1]; + foreach (QuotationMarkMetadata q in ResolveQuotationMark(quotationMarkMatch, previousMark, nextMark)) + yield return q; + } + if (QuotationMarkResolverState.HasOpenQuotationMark) + Issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); + } + + public IEnumerable ResolveQuotationMark( + QuotationMarkStringMatch quotationMarkMatch, + QuotationMarkStringMatch previousMatch, + QuotationMarkStringMatch nextMatch + ) + { + if (QuotationMarkCategorizer.IsOpeningQuotationMark(quotationMarkMatch)) + { + if (QuotationMarkCategorizer.IsEnglishQuoteContinuer(quotationMarkMatch, previousMatch, nextMatch)) + { + yield return ProcessQuoteContinuer(quotationMarkMatch, QuoteContinuerStyle.English); + } + else + { + if (IsDepthTooGreat()) + { + Issues.Add(QuotationMarkResolutionIssue.TooDeepNesting); + yield break; + } + + yield return ProcessOpeningMark(quotationMarkMatch); + } + } + else if (QuotationMarkCategorizer.IsApostrophe(quotationMarkMatch, nextMatch)) { } + else if 
(QuotationMarkCategorizer.IsClosingQuotationMark(quotationMarkMatch)) + { + if (QuotationMarkCategorizer.IsSpanishQuoteContinuer(quotationMarkMatch, previousMatch, nextMatch)) + { + yield return ProcessQuoteContinuer(quotationMarkMatch, QuoteContinuerStyle.Spanish); + } + else if (!QuotationMarkResolverState.HasOpenQuotationMark) + { + Issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); + yield break; + } + else + { + yield return ProcessClosingMark(quotationMarkMatch); + } + } + else if (QuotationMarkCategorizer.IsMalformedClosingQuotationMark(quotationMarkMatch)) + { + yield return ProcessClosingMark(quotationMarkMatch); + } + else if (QuotationMarkCategorizer.IsMalformedOpeningQuotationMark(quotationMarkMatch)) + { + yield return ProcessOpeningMark(quotationMarkMatch); + } + else if (QuotationMarkCategorizer.IsUnpairedClosingQuotationMark(quotationMarkMatch)) + { + Issues.Add(QuotationMarkResolutionIssue.UnpairedQuotationMark); + } + else + { + Issues.Add(QuotationMarkResolutionIssue.AmbiguousQuotationMark); + } + } + + private QuotationMarkMetadata ProcessQuoteContinuer( + QuotationMarkStringMatch quotationMarkMatch, + QuoteContinuerStyle continuerStyle + ) + { + return QuoteContinuerState.AddQuoteContinuer( + quotationMarkMatch, + QuotationMarkResolverState, + continuerStyle + ); + } + + private bool IsDepthTooGreat() + { + return QuotationMarkResolverState.AreMoreThanNQuotesOpen(3); + } + + private QuotationMarkMetadata ProcessOpeningMark(QuotationMarkStringMatch quotationMarkMatch) + { + if ( + !Settings.MetadataMatchesQuotationMark( + quotationMarkMatch.QuotationMark, + QuotationMarkResolverState.CurrentDepth + 1, + QuotationMarkDirection.Opening + ) + ) + { + Issues.Add(QuotationMarkResolutionIssue.IncompatibleQuotationMark); + } + return QuotationMarkResolverState.AddOpeningQuotationMark(quotationMarkMatch); + } + + private QuotationMarkMetadata ProcessClosingMark(QuotationMarkStringMatch quotationMarkMatch) + { + if ( + 
!Settings.MetadataMatchesQuotationMark( + quotationMarkMatch.QuotationMark, + QuotationMarkResolverState.CurrentDepth, + QuotationMarkDirection.Closing + ) + ) + { + Issues.Add(QuotationMarkResolutionIssue.IncompatibleQuotationMark); + } + return QuotationMarkResolverState.AddClosingQuotationMark(quotationMarkMatch); + } + + public virtual HashSet GetIssues() + { + return Issues; + } + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolutionSettings.cs b/src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolutionSettings.cs new file mode 100644 index 00000000..4e8f4721 --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolutionSettings.cs @@ -0,0 +1,17 @@ +using System.Collections.Generic; +using System.Text.RegularExpressions; + +namespace SIL.Machine.PunctuationAnalysis +{ + public interface IQuotationMarkResolutionSettings + { + bool IsValidOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMatch); + bool IsValidClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch); + Regex GetOpeningQuotationMarkRegex(); + Regex GetClosingQuotationMarkRegex(); + bool AreMarksAValidPair(string openingMark, string closingMark); + bool ShouldRelyOnParagraphMarkers(); + HashSet GetPossibleDepths(string quotationMark, QuotationMarkDirection direction); + bool MetadataMatchesQuotationMark(string quotationMark, int depth, QuotationMarkDirection direction); + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolver.cs b/src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolver.cs new file mode 100644 index 00000000..d34e95ba --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/IQuotationMarkResolver.cs @@ -0,0 +1,13 @@ +using System.Collections.Generic; + +namespace SIL.Machine.PunctuationAnalysis +{ + public interface IQuotationMarkResolver + { + IEnumerable ResolveQuotationMarks( + IReadOnlyList quotationMarkMatches + ); + void Reset(); + HashSet GetIssues(); + } +} diff --git 
a/src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs b/src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs new file mode 100644 index 00000000..867119a0 --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs @@ -0,0 +1,493 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; +using SIL.Extensions; + +namespace SIL.Machine.PunctuationAnalysis +{ + public class ApostropheProportionStatistics + { + private int _numCharacters; + private int _numApostrophes; + + public ApostropheProportionStatistics() + { + Reset(); + } + + public void Reset() + { + _numCharacters = 0; + _numApostrophes = 0; + } + + public void CountCharacters(TextSegment textSegment) + { + _numCharacters += textSegment.Length; + } + + public void AddApostrophe() + { + _numApostrophes++; + } + + public bool IsApostropheProportionGreaterThan(double threshold) + { + if (_numCharacters == 0) + return false; + return ((double)_numApostrophes / _numCharacters) > threshold; + } + } + + public class QuotationMarkWordPositions + { + private static readonly double MaximumProportionForRarity = 0.1; + private static readonly double MaximumProportionDifferenceThreshold = 0.3; + private readonly Dictionary _wordInitialOccurrences; + private readonly Dictionary _midWordOccurrences; + private readonly Dictionary _wordFinalOccurrences; + private readonly Dictionary _totalOccurrences; + + public QuotationMarkWordPositions() + { + _wordInitialOccurrences = new Dictionary(); + _midWordOccurrences = new Dictionary(); + _wordFinalOccurrences = new Dictionary(); + _totalOccurrences = new Dictionary(); + } + + public void Reset() + { + _wordInitialOccurrences.Clear(); + _midWordOccurrences.Clear(); + _wordFinalOccurrences.Clear(); + _totalOccurrences.Clear(); + } + + public void CountWordInitialApostrophe(string quotationMark) + { + _wordInitialOccurrences.UpdateValue(quotationMark, () => 0, 
i => i + 1); + _totalOccurrences.UpdateValue(quotationMark, () => 0, i => i + 1); + } + + public void CountMidWordApostrophe(string quotationMark) + { + _midWordOccurrences.UpdateValue(quotationMark, () => 0, i => i + 1); + _totalOccurrences.UpdateValue(quotationMark, () => 0, i => i + 1); + } + + public void CountWordFinalApostrophe(string quotationMark) + { + _wordFinalOccurrences.UpdateValue(quotationMark, () => 0, i => i + 1); + _totalOccurrences.UpdateValue(quotationMark, () => 0, i => i + 1); + } + + private int GetWordInitialOccurrences(string quotationMark) + { + return _wordInitialOccurrences.TryGetValue(quotationMark, out int count) ? count : 0; + } + + private int GetMidWordOccurrences(string quotationMark) + { + return _midWordOccurrences.TryGetValue(quotationMark, out int count) ? count : 0; + } + + private int GetWordFinalOccurrences(string quotationMark) + { + return _wordFinalOccurrences.TryGetValue(quotationMark, out int count) ? count : 0; + } + + private int GetTotalOccurrences(string quotationMark) + { + return GetWordInitialOccurrences(quotationMark) + + GetMidWordOccurrences(quotationMark) + + GetWordFinalOccurrences(quotationMark); + } + + public bool IsMarkRarelyInitial(string quotationMark) + { + int numInitialMarks = GetWordInitialOccurrences(quotationMark); + int numTotalMarks = GetTotalOccurrences(quotationMark); + return numTotalMarks > 0 && ((double)numInitialMarks / numTotalMarks) < MaximumProportionForRarity; + } + + public bool IsMarkRarelyFinal(string quotationMark) + { + int numFinalMarks = GetWordFinalOccurrences(quotationMark); + int numTotalMarks = GetTotalOccurrences(quotationMark); + return numTotalMarks > 0 && ((double)numFinalMarks / numTotalMarks) < MaximumProportionForRarity; + } + + public bool AreInitialAndFinalRatesSimilar(string quotationMark) + { + int numInitialMarks = GetWordInitialOccurrences(quotationMark); + int numFinalMarks = GetWordFinalOccurrences(quotationMark); + int numTotalMarks = 
GetTotalOccurrences(quotationMark); + return numTotalMarks > 0 + && ((double)Math.Abs(numInitialMarks - numFinalMarks) / numTotalMarks) + < MaximumProportionDifferenceThreshold; + } + + public bool IsMarkCommonlyMidWord(string quotationMark) + { + int numMidWordMarks = GetMidWordOccurrences(quotationMark); + int numTotalMarks = GetTotalOccurrences(quotationMark); + return numTotalMarks > 0 + && ((double)numMidWordMarks / numTotalMarks) > MaximumProportionDifferenceThreshold; + } + } + + public class QuotationMarkSequences + { + private static readonly int SoleOccurrenceMinimumCount = 5; + private static readonly int MuchMoreCommonMinimumRatio = 10; + private static readonly double MaximumProportionDifferenceThreshold = 0.2; + + private readonly Dictionary _earlierQuotationMarkCounts; + private readonly Dictionary _laterQuotationMarkCounts; + + public QuotationMarkSequences() + { + _earlierQuotationMarkCounts = new Dictionary(); + _laterQuotationMarkCounts = new Dictionary(); + } + + public void Reset() + { + _earlierQuotationMarkCounts.Clear(); + _laterQuotationMarkCounts.Clear(); + } + + public void CountEarlierQuotationMark(string quotationMark) + { + _earlierQuotationMarkCounts.UpdateValue(quotationMark, () => 0, i => i + 1); + } + + public void CountLaterQuotationMark(string quotationMark) + { + _laterQuotationMarkCounts.UpdateValue(quotationMark, () => 0, i => i + 1); + } + + private int GetEarlierOccurrences(string quotationMark) + { + return _earlierQuotationMarkCounts.TryGetValue(quotationMark, out int count) ? count : 0; + } + + private int GetLaterOccurrences(string quotationMark) + { + return _laterQuotationMarkCounts.TryGetValue(quotationMark, out int count) ? 
count : 0; + } + + public bool IsMarkMuchMoreCommonEarlier(string quotationMark) + { + int numEarlyOccurrences = GetEarlierOccurrences(quotationMark); + int numLateOccurrences = GetLaterOccurrences(quotationMark); + return (numLateOccurrences == 0 && numEarlyOccurrences > SoleOccurrenceMinimumCount) + || numEarlyOccurrences > numLateOccurrences * MuchMoreCommonMinimumRatio; + } + + public bool IsMarkMuchMoreCommonLater(string quotationMark) + { + int numEarlyOccurrences = GetEarlierOccurrences(quotationMark); + int numLateOccurrences = GetLaterOccurrences(quotationMark); + return (numEarlyOccurrences == 0 && numLateOccurrences > SoleOccurrenceMinimumCount) + || numLateOccurrences > numEarlyOccurrences * MuchMoreCommonMinimumRatio; + } + + public bool AreEarlyAndLateMarkRatesSimilar(string quotationMark) + { + int numEarlyOccurrences = GetEarlierOccurrences(quotationMark); + int numLateOccurrences = GetLaterOccurrences(quotationMark); + return numEarlyOccurrences > 0 + && ((double)Math.Abs(numLateOccurrences - numEarlyOccurrences) / numEarlyOccurrences) + < MaximumProportionDifferenceThreshold; + } + } + + public class QuotationMarkGrouper + { + private readonly QuoteConventionSet _quoteConventions; + private readonly Dictionary> _groupedQuotationMarks; + + public QuotationMarkGrouper( + List quotationMarks, + QuoteConventionSet quoteConventionSet + ) + { + _quoteConventions = quoteConventionSet; + _groupedQuotationMarks = GroupQuotationMarks(quotationMarks); + } + + private Dictionary> GroupQuotationMarks( + List quotationMarks + ) + { + return quotationMarks.GroupBy(qmm => qmm.QuotationMark).ToDictionary(g => g.Key, g => g.ToList()); + } + + public IEnumerable<(string Mark1, string Mark2)> GetQuotationMarkPairs() + { + foreach ( + (string mark1, List matches1) in _groupedQuotationMarks.Select(kvp => + (kvp.Key, kvp.Value) + ) + ) + { + // Handle cases of identical opening/closing marks + if ( + matches1.Count == 2 + && 
_quoteConventions.IsQuotationMarkDirectionAmbiguous(mark1) + && !HasDistinctPairedQuotationMark(mark1) + ) + { + yield return (mark1, mark1); + continue; + } + + // Skip verses where quotation mark pairs are ambiguous + if (matches1.Count > 1) + continue; + + // Find matching closing marks + foreach ( + (string mark2, List matches2) in _groupedQuotationMarks.Select(kvp => + (kvp.Key, kvp.Value) + ) + ) + { + if ( + matches2.Count == 1 + && _quoteConventions.MarksAreAValidPair(mark1, mark2) + && matches1[0].Precedes(matches2[0]) + ) + { + yield return (mark1, mark2); + } + } + } + } + + public bool HasDistinctPairedQuotationMark(string quotationMark) + { + return _quoteConventions + .GetPossiblePairedQuotationMarks(quotationMark) + .Any(m => m != quotationMark && _groupedQuotationMarks.ContainsKey(m)); + } + } + + public class PreliminaryApostropheAnalyzer + { + private static readonly double MaximumApostropheProportion = 0.02; + private static readonly Regex ApostrophePattern = new Regex(@"[\'\u2019]", RegexOptions.Compiled); + private readonly ApostropheProportionStatistics _apostropheProportionStatistics; + private readonly QuotationMarkWordPositions _wordPositionStatistics; + + public PreliminaryApostropheAnalyzer() + { + _apostropheProportionStatistics = new ApostropheProportionStatistics(); + _wordPositionStatistics = new QuotationMarkWordPositions(); + Reset(); + } + + public void Reset() + { + _apostropheProportionStatistics.Reset(); + _wordPositionStatistics.Reset(); + } + + public void ProcessQuotationMarks(List textSegments, List quotationMarks) + { + foreach (TextSegment textSegment in textSegments) + _apostropheProportionStatistics.CountCharacters(textSegment); + foreach (QuotationMarkStringMatch quotationMarkMatch in quotationMarks) + ProcessQuotationMark(quotationMarkMatch); + } + + private void ProcessQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + if (quotationMarkMatch.QuotationMarkMatches(ApostrophePattern)) + 
CountApostrophe(quotationMarkMatch); + } + + private void CountApostrophe(QuotationMarkStringMatch apostropheMatch) + { + string apostrophe = apostropheMatch.QuotationMark; + _apostropheProportionStatistics.AddApostrophe(); + if (IsMatchWordInitial(apostropheMatch)) + { + _wordPositionStatistics.CountWordInitialApostrophe(apostrophe); + } + else if (IsMatchMidWord(apostropheMatch)) + { + _wordPositionStatistics.CountMidWordApostrophe(apostrophe); + } + else if (IsMatchWordFinal(apostropheMatch)) + { + _wordPositionStatistics.CountWordFinalApostrophe(apostrophe); + } + } + + private bool IsMatchWordInitial(QuotationMarkStringMatch apostropheMatch) + { + if (apostropheMatch.HasTrailingWhitespace()) + return false; + if (!apostropheMatch.IsAtStartOfSegment && !apostropheMatch.HasLeadingWhitespace()) + return false; + return true; + } + + private bool IsMatchMidWord(QuotationMarkStringMatch apostropheMatch) + { + if (apostropheMatch.HasTrailingWhitespace()) + return false; + if (apostropheMatch.HasLeadingWhitespace()) + return false; + return true; + } + + private bool IsMatchWordFinal(QuotationMarkStringMatch apostropheMatch) + { + if (!apostropheMatch.IsAtEndOfSegment && !apostropheMatch.HasTrailingWhitespace()) + return false; + if (apostropheMatch.HasLeadingWhitespace()) + return false; + return true; + } + + public bool IsApostropheOnly(string mark) + { + if (!ApostrophePattern.IsMatch(mark)) + return false; + + if (_wordPositionStatistics.IsMarkRarelyInitial(mark) || _wordPositionStatistics.IsMarkRarelyFinal(mark)) + return true; + + if ( + _wordPositionStatistics.AreInitialAndFinalRatesSimilar(mark) + && _wordPositionStatistics.IsMarkCommonlyMidWord(mark) + ) + { + return true; + } + + if (_apostropheProportionStatistics.IsApostropheProportionGreaterThan(MaximumApostropheProportion)) + { + return true; + } + + return false; + } + } + + public class PreliminaryQuotationMarkAnalyzer + { + private readonly QuoteConventionSet _quoteConventions; + private readonly 
PreliminaryApostropheAnalyzer _apostropheAnalyzer; + private readonly QuotationMarkSequences _quotationMarkSequences; + + public PreliminaryQuotationMarkAnalyzer(QuoteConventionSet quoteConventions) + { + _quoteConventions = quoteConventions; + _apostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + _quotationMarkSequences = new QuotationMarkSequences(); + Reset(); + } + + /* Resets both the apostrophe analyzer and the mark-sequence counters. */ public void Reset() + { + _apostropheAnalyzer.Reset(); + _quotationMarkSequences.Reset(); + } + + /* Runs the per-verse analysis over every chapter, then returns SelectCompatibleQuoteConventions(). This method does not call Reset(), so statistics accumulate across calls. */ public QuoteConventionSet NarrowDownPossibleQuoteConventions(List chapters) + { + foreach (Chapter chapter in chapters) + AnalyzeQuotationMarksForChapter(chapter); + return SelectCompatibleQuoteConventions(); + } + + private void AnalyzeQuotationMarksForChapter(Chapter chapter) + { + foreach (Verse verse in chapter.Verses) + AnalyzeQuotationMarksForVerse(verse); + } + + private void AnalyzeQuotationMarksForVerse(Verse verse) + { + List quotationMarks = new QuotationMarkFinder( + _quoteConventions + ).FindAllPotentialQuotationMarksInVerse(verse); + AnalyzeQuotationMarkSequence(quotationMarks); + _apostropheAnalyzer.ProcessQuotationMarks(verse.TextSegments.ToList(), quotationMarks); + } + + private void AnalyzeQuotationMarkSequence(List quotationMarks) + { + var quotationMarkGrouper = new QuotationMarkGrouper(quotationMarks, _quoteConventions); + foreach ((string earlierMark, string laterMark) in quotationMarkGrouper.GetQuotationMarkPairs()) + { + _quotationMarkSequences.CountEarlierQuotationMark(earlierMark); + _quotationMarkSequences.CountLaterQuotationMark(laterMark); + } + } + + /* Filters the convention set to those compatible with the marks currently judged to be opening and closing marks. */ public QuoteConventionSet SelectCompatibleQuoteConventions() + { + List openingQuotationMarks = FindOpeningQuotationMarks(); + List closingQuotationMarks = FindClosingQuotationMarks(); + + return _quoteConventions.FilterToCompatibleQuoteConventions(openingQuotationMarks, closingQuotationMarks); + } + + private List FindOpeningQuotationMarks() + { + return _quoteConventions + .GetPossibleOpeningQuotationMarks() +
.Where(qm => IsOpeningQuotationMark(qm)) + .ToList(); + } + + /* A mark counts as opening when it is not apostrophe-only and is either much more common as the earlier mark of a pair, or is direction-ambiguous with similar earlier and later rates. */ private bool IsOpeningQuotationMark(string quotationMark) + { + if (_apostropheAnalyzer.IsApostropheOnly(quotationMark)) + return false; + + if (_quotationMarkSequences.IsMarkMuchMoreCommonEarlier(quotationMark)) + return true; + if ( + _quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar(quotationMark) + && _quoteConventions.IsQuotationMarkDirectionAmbiguous(quotationMark) + ) + { + return true; + } + return false; + } + + private List FindClosingQuotationMarks() + { + return _quoteConventions + .GetPossibleClosingQuotationMarks() + .Where(qm => IsClosingQuotationMark(qm)) + .ToList(); + } + + /* Mirror of IsOpeningQuotationMark, using the much-more-common-later test. */ private bool IsClosingQuotationMark(string quotationMark) + { + if (_apostropheAnalyzer.IsApostropheOnly(quotationMark)) + return false; + + if (_quotationMarkSequences.IsMarkMuchMoreCommonLater(quotationMark)) + return true; + if ( + _quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar(quotationMark) + && _quoteConventions.IsQuotationMarkDirectionAmbiguous(quotationMark) + ) + { + return true; + } + return false; + } + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkDirection.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkDirection.cs new file mode 100644 index 00000000..52d63b33 --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkDirection.cs @@ -0,0 +1,8 @@ +namespace SIL.Machine.PunctuationAnalysis +{ + /* Direction assigned to a quotation mark. */ public enum QuotationMarkDirection + { + Opening, + Closing + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs new file mode 100644 index 00000000..d5602ed3 --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs @@ -0,0 +1,54 @@ +using System.Collections.Generic; +using System.Linq; +using PCRE; + +namespace SIL.Machine.PunctuationAnalysis +{ + public class QuotationMarkFinder + { + private static readonly PcreRegex QuotationMarkPattern = new
PcreRegex(@"(\p{Quotation_Mark}|<<|>>|<|>)"); + private readonly QuoteConventionSet _quoteConventions; + + public QuotationMarkFinder(QuoteConventionSet quoteConventions) + { + _quoteConventions = quoteConventions; + } + + /* Concatenates the matches of every verse in the chapter, in verse order. */ public List FindAllPotentialQuotationMarksInChapter(Chapter chapter) + { + var quotationMatches = new List(); + foreach (Verse verse in chapter.Verses) + quotationMatches.AddRange(FindAllPotentialQuotationMarksInVerse(verse)); + return quotationMatches; + } + + public List FindAllPotentialQuotationMarksInVerse(Verse verse) + { + return FindAllPotentialQuotationMarksInTextSegments(verse.TextSegments); + } + + public virtual List FindAllPotentialQuotationMarksInTextSegments( + IReadOnlyList textSegments + ) + { + return textSegments.SelectMany(ts => FindAllPotentialQuotationMarksInTextSegment(ts)).ToList(); + } + + /* Returns one match per regex hit in the segment text that the convention set accepts as an opening or closing mark. Start and end are taken directly from the regex match Index and Length. NOTE(review): QuotationMarkStringMatch reads its indices with text-element APIs; confirm both index spaces agree. */ public List FindAllPotentialQuotationMarksInTextSegment(TextSegment textSegment) + { + return QuotationMarkPattern + .Matches(textSegment.Text) + .Cast() + .Where(match => + _quoteConventions.IsValidOpeningQuotationMark(match.Groups[0].Value) + || _quoteConventions.IsValidClosingQuotationMark(match.Groups[0].Value) + ) + .Select(m => new QuotationMarkStringMatch( + textSegment, + m.Groups[0].Index, + m.Groups[0].Index + m.Groups[0].Length + )) + .ToList(); + } + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkMetadata.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkMetadata.cs new file mode 100644 index 00000000..b6530b9d --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkMetadata.cs @@ -0,0 +1,84 @@ +using System; + +namespace SIL.Machine.PunctuationAnalysis +{ + public class QuotationMarkMetadata : IEquatable + { + public string QuotationMark { get; } + public int Depth { get; } + public QuotationMarkDirection Direction { get; } + public TextSegment TextSegment { get; } + public int StartIndex { get; private set; } + public int EndIndex { get; private set; } + + public QuotationMarkMetadata( +
string quotationMark, + int depth, + QuotationMarkDirection direction, + TextSegment textSegment, + int startIndex, + int endIndex + ) + { + QuotationMark = quotationMark; + Depth = depth; + Direction = direction; + TextSegment = textSegment; + StartIndex = startIndex; + EndIndex = endIndex; + } + + public int Length => EndIndex - StartIndex; + + /* Moves both indices by the same amount, leaving Length unchanged. */ public void ShiftIndices(int shiftAmount) + { + StartIndex += shiftAmount; + EndIndex += shiftAmount; + } + + public override bool Equals(object obj) + { + if (!(obj is QuotationMarkMetadata other)) + { + return false; + } + return Equals(other); + } + + /* Value equality over all six members. A null argument is never equal, so the method returns false instead of throwing. */ public bool Equals(QuotationMarkMetadata other) + { + return !(other is null) + && QuotationMark.Equals(other.QuotationMark) + && Depth.Equals(other.Depth) + && Direction.Equals(other.Direction) + && TextSegment.Equals(other.TextSegment) + && StartIndex.Equals(other.StartIndex) + && EndIndex.Equals(other.EndIndex); + } + + /* Combines the same six members used by Equals. StartIndex and EndIndex are mutable, so the hash changes after ShiftIndices or UpdateQuotationMark. */ public override int GetHashCode() + { + int hashCode = 23; + hashCode = hashCode * 31 + QuotationMark.GetHashCode(); + hashCode = hashCode * 31 + Depth.GetHashCode(); + hashCode = hashCode * 31 + Direction.GetHashCode(); + hashCode = hashCode * 31 + TextSegment.GetHashCode(); + hashCode = hashCode * 31 + StartIndex.GetHashCode(); + hashCode = hashCode * 31 + EndIndex.GetHashCode(); + return hashCode; + } + + /* Rewrites the marked span of the text segment with the mark the convention expects at this depth and direction, then adjusts EndIndex by the length difference. The QuotationMark property itself is not changed. */ public void UpdateQuotationMark(QuoteConvention quoteConvention) + { + string updatedQuotationMark = quoteConvention.GetExpectedQuotationMark(Depth, Direction); + if (updatedQuotationMark.Equals(QuotationMark)) + return; + + TextSegment.ReplaceSubstring(StartIndex, EndIndex, updatedQuotationMark); + + if (updatedQuotationMark.Length != QuotationMark.Length) + { + EndIndex += updatedQuotationMark.Length - QuotationMark.Length; + } + } + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkResolutionIssue.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkResolutionIssue.cs new file mode 100644 index 00000000..233dc45c --- /dev/null +++
b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkResolutionIssue.cs @@ -0,0 +1,11 @@ +namespace SIL.Machine.PunctuationAnalysis +{ + /* Issue kinds that a quotation mark resolver can record. */ public enum QuotationMarkResolutionIssue + { + UnpairedQuotationMark, + TooDeepNesting, + IncompatibleQuotationMark, + AmbiguousQuotationMark, + UnexpectedQuotationMark + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs new file mode 100644 index 00000000..1dffa148 --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs @@ -0,0 +1,190 @@ +using System; +using System.Globalization; +using System.Text.RegularExpressions; +using PCRE; + +namespace SIL.Machine.PunctuationAnalysis +{ + /* A candidate quotation mark: the span from StartIndex (inclusive) to EndIndex (exclusive) inside one TextSegment. */ public class QuotationMarkStringMatch + { + private static readonly PcreRegex LetterPattern = new PcreRegex(@"[\p{L}\N{U+0001E200}-\N{U+0001E28F}]"); + private static readonly PcreRegex LatinLetterPattern = new PcreRegex(@"^\p{Script_Extensions=Latin}$"); + private static readonly Regex WhitespacePattern = new Regex(@"[\s~]", RegexOptions.Compiled); + private static readonly Regex PunctuationPattern = new Regex(@"[\.,;\?!\)\]\-—۔،؛]", RegexOptions.Compiled); + private static readonly Regex QuoteIntroducerPattern = new Regex(@"[:,]\s*$", RegexOptions.Compiled); + + public TextSegment TextSegment { get; } + public int StartIndex { get; } + public int EndIndex { get; } + + public QuotationMarkStringMatch(TextSegment textSegment, int startIndex, int endIndex) + { + TextSegment = textSegment; + StartIndex = startIndex; + EndIndex = endIndex; + } + + /* Two matches are equal when they have equal text segments and the same start and end indices. */ public override bool Equals(object obj) + { + if (!(obj is QuotationMarkStringMatch other)) + return false; + return TextSegment.Equals(other.TextSegment) + && StartIndex == other.StartIndex + && EndIndex == other.EndIndex; + } + + public override int GetHashCode() + { + int code = 23; + code = code * 31 + TextSegment.GetHashCode(); + code = code * 31 + StartIndex.GetHashCode(); + code =
code * 31 + EndIndex.GetHashCode(); + return code; + } + + /* The matched mark, extracted from the segment text by text-element index. */ public string QuotationMark => + new StringInfo(TextSegment.Text).SubstringByTextElements(StartIndex, EndIndex - StartIndex); + + public bool IsValidOpeningQuotationMark(QuoteConventionSet quoteConventions) => + quoteConventions.IsValidOpeningQuotationMark(QuotationMark); + + public bool IsValidClosingQuotationMark(QuoteConventionSet quoteConventions) => + quoteConventions.IsValidClosingQuotationMark(QuotationMark); + + public bool QuotationMarkMatches(Regex regexPattern) => regexPattern.IsMatch(QuotationMark); + + public bool NextCharacterMatches(Regex regexPattern) => + NextCharacter != null && regexPattern.IsMatch(NextCharacter); + + public bool NextCharacterMatches(PcreRegex regexPattern) => + NextCharacter != null && regexPattern.IsMatch(NextCharacter); + + public bool PreviousCharacterMatches(Regex regexPattern) => + PreviousCharacter != null && regexPattern.IsMatch(PreviousCharacter); + + public bool PreviousCharacterMatches(PcreRegex regexPattern) => + PreviousCharacter != null && regexPattern.IsMatch(PreviousCharacter); + + /* The text element immediately before the mark. At the start of a segment this is the last text element of the previous segment, or null when there is no previous segment or a paragraph marker precedes this segment. */ public string PreviousCharacter + { + get + { + if (IsAtStartOfSegment) + { + TextSegment previousSegment = TextSegment.PreviousSegment; + if (previousSegment != null && !TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph)) + { + /* SubstringByTextElements takes a text-element index, so the last element is located with LengthInTextElements rather than the UTF-16 string length. */ return new StringInfo(previousSegment.Text).SubstringByTextElements( + new StringInfo(previousSegment.Text).LengthInTextElements - 1, + 1 + ); + } + return null; + } + return new StringInfo(TextSegment.Text).SubstringByTextElements(StartIndex - 1, 1); + } + } + + /* The text element immediately after the mark. At the end of a segment this is the first text element of the next segment, or null when there is no next segment or a paragraph marker separates the two segments. */ public string NextCharacter + { + get + { + if (IsAtEndOfSegment) + { + TextSegment nextSegment = TextSegment.NextSegment; + /* A paragraph marker between this segment and the next is part of the preceding context of the next segment, so that is the segment to ask. */ if (nextSegment != null && !nextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph)) + { + return new StringInfo(nextSegment.Text).SubstringByTextElements(0, 1); + } + return null; + } + return new StringInfo(TextSegment.Text).SubstringByTextElements(EndIndex, 1); + } + } + +
/* The four substring helpers below test the segment text before StartIndex or after EndIndex against the given pattern. */ public bool LeadingSubstringMatches(Regex regexPattern) => + regexPattern.IsMatch(TextSegment.SubstringBefore(StartIndex)); + + public bool LeadingSubstringMatches(PcreRegex regexPattern) => + regexPattern.IsMatch(TextSegment.SubstringBefore(StartIndex)); + + public bool TrailingSubstringMatches(Regex regexPattern) => + regexPattern.IsMatch(TextSegment.SubstringAfter(EndIndex)); + + public bool TrailingSubstringMatches(PcreRegex regexPattern) => + regexPattern.IsMatch(TextSegment.SubstringAfter(EndIndex)); + + // This assumes that the two matches occur in the same verse + public bool Precedes(QuotationMarkStringMatch other) + { + return TextSegment.IndexInVerse < other.TextSegment.IndexInVerse + || (TextSegment.IndexInVerse == other.TextSegment.IndexInVerse && StartIndex < other.StartIndex); + } + + // Not used, but a useful method for debugging + /* Returns up to 10 positions of text on each side of the mark. NOTE(review): this uses String.Substring while QuotationMark uses text-element indexing; confirm both index spaces agree. */ public string Context() + { + int contextStartIndex = Math.Max(StartIndex - 10, 0); + int contextEndIndex = Math.Min(EndIndex + 10, TextSegment.Length); + return TextSegment.Text.Substring(contextStartIndex, contextEndIndex - contextStartIndex); + } + + public QuotationMarkMetadata Resolve(int depth, QuotationMarkDirection direction) => + new QuotationMarkMetadata(QuotationMark, depth, direction, TextSegment, StartIndex, EndIndex); + + public bool IsAtStartOfSegment => StartIndex == 0; + + public bool IsAtEndOfSegment => EndIndex == TextSegment.Length; + + /* With no previous character available, a preceding paragraph, embed or verse marker is treated as leading whitespace. */ public bool HasLeadingWhitespace() + { + if (PreviousCharacter == null) + { + return TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph) + || TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Embed) + || TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse); + } + return PreviousCharacterMatches(WhitespacePattern); + } + + public bool HasTrailingWhitespace() + { + return NextCharacterMatches(WhitespacePattern); + } + + public bool HasLeadingPunctuation() + { + return PreviousCharacterMatches(PunctuationPattern); + } + + public bool
HasTrailingPunctuation() + { + return NextCharacterMatches(PunctuationPattern); + } + + public bool HasLetterInLeadingSubstring() + { + return LeadingSubstringMatches(LetterPattern); + } + + public bool HasLetterInTrailingSubstring() + { + return TrailingSubstringMatches(LetterPattern); + } + + public bool HasLeadingLatinLetter() + { + return PreviousCharacterMatches(LatinLetterPattern); + } + + public bool HasTrailingLatinLetter() + { + return NextCharacterMatches(LatinLetterPattern); + } + + public bool HasQuoteIntroducerInLeadingSubstring() + { + return LeadingSubstringMatches(QuoteIntroducerPattern); + } + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs new file mode 100644 index 00000000..e12a2054 --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs @@ -0,0 +1,139 @@ +using System; +using System.Collections.Generic; +using System.Text; +using SIL.Extensions; + +namespace SIL.Machine.PunctuationAnalysis +{ + /* Counts occurrences of each quotation mark string and keeps a running total. */ public class QuotationMarkCounts + { + private readonly Dictionary _quotationMarkCounter; + + public int TotalCount { get; private set; } + + public QuotationMarkCounts() + { + _quotationMarkCounter = new Dictionary(); + TotalCount = 0; + } + + public void CountQuotationMark(string quotationMark) + { + _quotationMarkCounter.UpdateValue(quotationMark, () => 0, i => i + 1); + TotalCount++; + } + + /* Returns the most frequently counted mark, its count, and the total count. NOTE(review): behavior on an empty counter depends on the MaxBy extension; confirm callers only use this after at least one mark was counted. */ public (string BestString, int BestStringCount, int TotalStringCount) FindBestQuotationMarkProportion() + { + string bestString = _quotationMarkCounter.MaxBy(kvp => kvp.Value).Key; + return (bestString, _quotationMarkCounter[bestString], TotalCount); + } + + /* Number of counted marks that differ from the expected mark; equals TotalCount when the expected mark was never counted. */ public int CalculateNumDifferences(string expectedQuotationMark) + { + if (!_quotationMarkCounter.TryGetValue(expectedQuotationMark, out int count)) + { + return TotalCount; + } + return TotalCount - count; + } + } + + public class QuotationMarkTabulator + { + private readonly Dictionary<
+ (int Depth, QuotationMarkDirection Direction), + QuotationMarkCounts + > _quotationCountsByDepthAndDirection; + + public QuotationMarkTabulator() + { + _quotationCountsByDepthAndDirection = + new Dictionary<(int Depth, QuotationMarkDirection Direction), QuotationMarkCounts>(); + } + + /* Adds every resolved mark to the counter for its (depth, direction) pair. */ public void Tabulate(List quotationMarks) + { + foreach (QuotationMarkMetadata quotationMark in quotationMarks) + { + CountQuotationMark(quotationMark); + } + } + + private void CountQuotationMark(QuotationMarkMetadata quote) + { + (int Depth, QuotationMarkDirection Direction) key = (quote.Depth, quote.Direction); + string quotationMark = quote.QuotationMark; + _quotationCountsByDepthAndDirection.UpdateValue( + key, + () => new QuotationMarkCounts(), + counts => + { + counts.CountQuotationMark(quotationMark); + return counts; + } + ); + } + + /* Returns 1 minus the weighted share of tabulated marks that differ from what the convention expects, with each (depth, direction) bucket weighted by 2 to the power of minus depth. Returns 0.0 when the total weight is zero. */ public double CalculateSimilarity(QuoteConvention quoteConvention) + { + double weightedDifference = 0.0; + double totalWeight = 0.0; + foreach ((int depth, QuotationMarkDirection direction) in _quotationCountsByDepthAndDirection.Keys) + { + string expectedQuotationMark = quoteConvention.GetExpectedQuotationMark(depth, direction); + + // Give higher weight to shallower depths, since deeper marks are more likely to be mistakes + weightedDifference += ( + _quotationCountsByDepthAndDirection[(depth, direction)] + .CalculateNumDifferences(expectedQuotationMark) * Math.Pow(2, -depth) + ); + totalWeight += _quotationCountsByDepthAndDirection[(depth, direction)].TotalCount * Math.Pow(2, -depth); + } + if (totalWeight == 0.0) + { + return 0.0; + } + return 1 - (weightedDifference / totalWeight); + } + + private bool DepthAndDirectionObserved(int depth, QuotationMarkDirection direction) + { + return _quotationCountsByDepthAndDirection.ContainsKey((depth, direction)); + } + + private ( + string openingQuotationMark, + int observedOpeningCount, + int totalOpeningCount + ) FindMostCommonQuotationMarkWithDepthAndDirection(int depth, QuotationMarkDirection
direction) + { + return _quotationCountsByDepthAndDirection.TryGetValue((depth, direction), out QuotationMarkCounts counts) + ? counts.FindBestQuotationMarkProportion() + : (null, 0, 0); + } + + /* Builds one summary line for each depth from 1 to 4 at which both an opening and a closing mark were observed. */ public string GetSummaryMessage() + { + var message = new StringBuilder(); + for (int depth = 1; depth < 5; depth++) + { + (string openingQuotationMark, int observedOpeningCount, int totalOpeningCount) = + FindMostCommonQuotationMarkWithDepthAndDirection(depth, QuotationMarkDirection.Opening); + (string closingQuotationMark, int observedClosingCount, int totalClosingCount) = + FindMostCommonQuotationMarkWithDepthAndDirection(depth, QuotationMarkDirection.Closing); + + if ( + DepthAndDirectionObserved(depth, QuotationMarkDirection.Opening) + && DepthAndDirectionObserved(depth, QuotationMarkDirection.Closing) + ) + { + message.AppendLine( + $"The most common level {depth} quotation marks are {openingQuotationMark} ({observedOpeningCount} of {totalOpeningCount} opening marks) and {closingQuotationMark} ({observedClosingCount} of {totalClosingCount} closing marks)" + ); + } + } + return message.ToString(); + } + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/QuoteConvention.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConvention.cs new file mode 100644 index 00000000..c8e17e85 --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConvention.cs @@ -0,0 +1,154 @@ +using System.Collections.Generic; +using System.Linq; + +namespace SIL.Machine.PunctuationAnalysis +{ + /* The opening and closing mark used at one nesting level of a quote convention. */ public class SingleLevelQuoteConvention + { + public static readonly IReadOnlyDictionary QuoteNormalizationMap = new Dictionary() + { + { "\u00ab", '\"' }, + { "\u00bb", '"' }, + { "\u2018", '\'' }, + { "\u2019", '\'' }, + { "\u201a", '\'' }, + { "\u201c", '"' }, + { "\u201d", '"' }, + { "\u201e", '"' }, + { "\u300a", '"' }, + { "\u300b", '"' }, + { "\u300c", '"' }, + { "\u300d", '"' } + }; + public string OpeningQuotationMark { get; } + public string ClosingQuotationMark { get; } + + public
SingleLevelQuoteConvention(string openingQuotationMark, string closingQuotationMark) + { + OpeningQuotationMark = openingQuotationMark; + ClosingQuotationMark = closingQuotationMark; + } + + /* Returns a new level whose marks are replaced through QuoteNormalizationMap; marks without a mapping are kept unchanged. */ public SingleLevelQuoteConvention Normalize() + { + string normalizedOpeningQuotationMark = QuoteNormalizationMap.TryGetValue( + OpeningQuotationMark, + out char quote + ) + ? quote.ToString() + : OpeningQuotationMark; + string normalizedClosingQuotationMark = QuoteNormalizationMap.TryGetValue(ClosingQuotationMark, out quote) + ? quote.ToString() + : ClosingQuotationMark; + return new SingleLevelQuoteConvention(normalizedOpeningQuotationMark, normalizedClosingQuotationMark); + } + } + + /* A named, ordered list of per-level conventions; depths are 1-based indexes into LevelConventions. */ public class QuoteConvention + { + public string Name { get; } + + public IReadOnlyList LevelConventions { get; } + + public QuoteConvention(string name, List levels) + { + Name = name; + LevelConventions = levels; + } + + public int NumLevels => LevelConventions.Count; + + public string GetOpeningQuotationMarkAtDepth(int depth) + { + return LevelConventions[depth - 1].OpeningQuotationMark; + } + + public string GetClosingQuotationMarkAtDepth(int depth) + { + return LevelConventions[depth - 1].ClosingQuotationMark; + } + + /* Returns the mark for the given depth and direction, or an empty string when depth is outside the range 1 to NumLevels. */ public string GetExpectedQuotationMark(int depth, QuotationMarkDirection direction) + { + if (depth > NumLevels || depth < 1) + return ""; + return direction == QuotationMarkDirection.Opening + ?
GetOpeningQuotationMarkAtDepth(depth) + : GetClosingQuotationMarkAtDepth(depth); + } + + public bool IncludesOpeningQuotationMark(string openingQuotationMark) + { + foreach (SingleLevelQuoteConvention level in LevelConventions) + { + if (level.OpeningQuotationMark == openingQuotationMark) + return true; + } + return false; + } + + public bool IncludesClosingQuotationMark(string closingQuotationMark) + { + foreach (SingleLevelQuoteConvention level in LevelConventions) + { + if (level.ClosingQuotationMark == closingQuotationMark) + return true; + } + return false; + } + + /* Returns every 1-based depth at which this convention uses the mark in the given direction. */ public HashSet GetPossibleDepths(string quotationMark, QuotationMarkDirection direction) + { + var depths = new HashSet(); + foreach ( + (int depth, SingleLevelQuoteConvention levelConvention) in LevelConventions.Select((l, i) => (i + 1, l)) + ) + { + if ( + direction == QuotationMarkDirection.Opening + && levelConvention.OpeningQuotationMark == quotationMark + ) + { + depths.Add(depth); + } + else if ( + direction == QuotationMarkDirection.Closing + && levelConvention.ClosingQuotationMark == quotationMark + ) + { + depths.Add(depth); + } + } + return depths; + } + + /* True only if every observed mark belongs to this convention and the depth-1 opening and closing marks were both observed. */ public bool IsCompatibleWithObservedQuotationMarks( + List openingQuotationMarks, + List closingQuotationMarks + ) + { + foreach (string openingQuotationMark in openingQuotationMarks) + { + if (!IncludesOpeningQuotationMark(openingQuotationMark)) + return false; + } + foreach (string closingQuotationMark in closingQuotationMarks) + { + if (!IncludesClosingQuotationMark(closingQuotationMark)) + return false; + } + + // we require the first-level quotes to have been observed + if (!openingQuotationMarks.Contains(GetOpeningQuotationMarkAtDepth(1))) + return false; + if (!closingQuotationMarks.Contains(GetClosingQuotationMarkAtDepth(1))) + return false; + return true; + } + + public QuoteConvention Normalize() + { + return new QuoteConvention(Name + "_normalized", LevelConventions.Select(l => l.Normalize()).ToList()); + } + } +} diff --git
a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetectionResolutionSettings.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetectionResolutionSettings.cs new file mode 100644 index 00000000..f030b4cc --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetectionResolutionSettings.cs @@ -0,0 +1,55 @@ +using System.Collections.Generic; +using System.Text.RegularExpressions; + +namespace SIL.Machine.PunctuationAnalysis +{ + /* Resolution settings backed by a QuoteConventionSet. Every member forwards to the wrapped set (or to the match, passing the set), except ShouldRelyOnParagraphMarkers, which always returns true. */ public class QuoteConventionDetectionResolutionSettings : IQuotationMarkResolutionSettings + { + private readonly QuoteConventionSet _quoteConventions; + + public QuoteConventionDetectionResolutionSettings(QuoteConventionSet quoteConventions) + { + _quoteConventions = quoteConventions; + } + + public bool AreMarksAValidPair(string openingMark, string closingMark) + { + return _quoteConventions.MarksAreAValidPair(openingMark, closingMark); + } + + public Regex GetClosingQuotationMarkRegex() + { + return _quoteConventions.ClosingQuotationMarkRegex; + } + + public Regex GetOpeningQuotationMarkRegex() + { + return _quoteConventions.OpeningQuotationMarkRegex; + } + + public HashSet GetPossibleDepths(string quotationMark, QuotationMarkDirection direction) + { + return _quoteConventions.GetPossibleDepths(quotationMark, direction); + } + + public bool IsValidClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + return quotationMarkMatch.IsValidClosingQuotationMark(_quoteConventions); + } + + public bool IsValidOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMatch) + { + return quotationMarkMatch.IsValidOpeningQuotationMark(_quoteConventions); + } + + public bool MetadataMatchesQuotationMark(string quotationMark, int depth, QuotationMarkDirection direction) + { + return _quoteConventions.MetadataMatchesQuotationMark(quotationMark, depth, direction); + } + + public bool ShouldRelyOnParagraphMarkers() + { + return true; + } + } +} diff --git
a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs new file mode 100644 index 00000000..bd6c7fea --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs @@ -0,0 +1,78 @@ +using System.Collections.Generic; +using System.Linq; + +namespace SIL.Machine.PunctuationAnalysis +{ + /* Result holder: the best matching convention, its similarity score, and a summary string. */ public class QuoteConventionAnalysis + { + public QuoteConvention BestQuoteConvention { get; private set; } + public double BestQuoteConventionScore { get; private set; } + public string AnalysisSummary { get; private set; } + + public QuoteConventionAnalysis( + QuoteConvention bestQuoteConvention, + double bestQuoteConventionScore, + string analysisSummary + ) + { + BestQuoteConvention = bestQuoteConvention; + BestQuoteConventionScore = bestQuoteConventionScore; + AnalysisSummary = analysisSummary; + } + } + + public class QuoteConventionDetector : UsfmStructureExtractor + { + private readonly QuotationMarkTabulator _quotationMarkTabulator; + + public QuoteConventionDetector() + : base() + { + _quotationMarkTabulator = new QuotationMarkTabulator(); + } + + /* Narrows QuoteConventions.Standard with a preliminary analysis pass over all chapters, then resolves and tabulates the marks of each chapter against the narrowed set. */ private void CountQuotationMarksInChapters(List chapters) + { + QuoteConventionSet possibleQuoteConventions = new PreliminaryQuotationMarkAnalyzer( + QuoteConventions.Standard + ).NarrowDownPossibleQuoteConventions(chapters); + + foreach (Chapter chapter in chapters) + CountQuotationMarksInChapter(chapter, possibleQuoteConventions); + } + + private void CountQuotationMarksInChapter(Chapter chapter, QuoteConventionSet possibleQuoteConventions) + { + List quotationMarkMatches = new QuotationMarkFinder( + possibleQuoteConventions + ).FindAllPotentialQuotationMarksInChapter(chapter); + + List resolvedQuotationMarks = new DepthBasedQuotationMarkResolver( + new QuoteConventionDetectionResolutionSettings(possibleQuoteConventions) + ) + .ResolveQuotationMarks(quotationMarkMatches) + .ToList(); + +
_quotationMarkTabulator.Tabulate(resolvedQuotationMarks); + } + + /* Tabulates the marks of all extracted chapters and returns the best-scoring standard convention, or null when no convention scores above zero. */ public QuoteConventionAnalysis DetectQuotationConvention() + { + CountQuotationMarksInChapters(GetChapters()); + + (QuoteConvention bestQuoteConvention, double score) = QuoteConventions.Standard.FindMostSimilarConvention( + _quotationMarkTabulator + ); + + if (score > 0 && bestQuoteConvention != null) + { + return new QuoteConventionAnalysis( + bestQuoteConvention, + score, + _quotationMarkTabulator.GetSummaryMessage() + ); + } + return null; + } + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs new file mode 100644 index 00000000..f208df92 --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs @@ -0,0 +1,234 @@ +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using System.Text.RegularExpressions; +using SIL.Extensions; + +namespace SIL.Machine.PunctuationAnalysis +{ + public class QuoteConventionSet : IEquatable + { + public IReadOnlyList Conventions { get; } + + public Regex OpeningQuotationMarkRegex { get; private set; } + public Regex ClosingQuotationMarkRegex { get; private set; } + public Regex AllQuotationMarkRegex { get; private set; } + + public IReadOnlyDictionary> ClosingMarksByOpeningMark { get; private set; } + public IReadOnlyDictionary> OpeningMarksByClosingMark { get; private set; } + + public QuoteConventionSet(List conventions) + { + Conventions = conventions; + CreateQuotationMarkRegexes(); + CreateQuotationMarkPairMap(); + } + + public override bool Equals(object obj) + { + if (!(obj is QuoteConventionSet other)) + return false; + return Equals(other); + } + + /* Two sets are equal when they hold equal conventions in the same order. A null argument is never equal. */ public bool Equals(QuoteConventionSet other) + { + return !(other is null) && Conventions.SequenceEqual(other.Conventions); + } + + /* Combines the hash codes of the conventions in order, so that sets which compare equal through SequenceEqual also produce the same hash code. */ public override int GetHashCode() + { + int hashCode = 23; + return Conventions.Aggregate(hashCode, (hash, convention) => hash * 31 + (convention == null ? 0 : convention.GetHashCode())); + } + + private void
CreateQuotationMarkRegexes() + { + /* Start from empty patterns; they are replaced below only when at least one mark exists. */ OpeningQuotationMarkRegex = new Regex(@"", RegexOptions.Compiled); + ClosingQuotationMarkRegex = new Regex(@"", RegexOptions.Compiled); + AllQuotationMarkRegex = new Regex(@"", RegexOptions.Compiled); + + var openingQuotationMarks = new HashSet(); + var closingQuotationMarks = new HashSet(); + + foreach (QuoteConvention convention in Conventions) + { + for (int level = 1; level < convention.NumLevels + 1; level++) + { + string openingQuote = convention.GetOpeningQuotationMarkAtDepth(level); + string closingQuote = convention.GetClosingQuotationMarkAtDepth(level); + openingQuotationMarks.Add(openingQuote); + closingQuotationMarks.Add(closingQuote); + } + } + + var allQuotationMarks = openingQuotationMarks.Union(closingQuotationMarks).ToImmutableHashSet(); + + /* NOTE(review): the marks are joined unescaped into a character class, so a mark containing a regex-special character such as a closing bracket, backslash or hyphen would change the pattern; confirm conventions never contain one. */ if (allQuotationMarks.Count > 0) + { + OpeningQuotationMarkRegex = new Regex( + @"[" + string.Join("", openingQuotationMarks.OrderBy(q => q)) + "]", + RegexOptions.Compiled + ); + ClosingQuotationMarkRegex = new Regex( + @"[" + string.Join("", closingQuotationMarks.OrderBy(q => q)) + "]", + RegexOptions.Compiled + ); + AllQuotationMarkRegex = new Regex( + @"[" + string.Join("", allQuotationMarks.OrderBy(q => q)) + "]", + RegexOptions.Compiled + ); + } + } + + /* Builds both lookup tables over all levels of all conventions: opening mark to the closing marks paired with it, and closing mark to the opening marks paired with it. */ private void CreateQuotationMarkPairMap() + { + var closingMarksByOpeningMark = new Dictionary>(); + var openingMarksByClosingMark = new Dictionary>(); + foreach (QuoteConvention convention in Conventions) + { + for (int level = 1; level < convention.NumLevels + 1; level++) + { + string openingQuote = convention.GetOpeningQuotationMarkAtDepth(level); + string closingQuote = convention.GetClosingQuotationMarkAtDepth(level); + closingMarksByOpeningMark.UpdateValue( + openingQuote, + () => new HashSet(), + set => + { + set.Add(closingQuote); + return set; + } + ); + openingMarksByClosingMark.UpdateValue( + closingQuote, + () => new HashSet(), + set => + { + set.Add(openingQuote); + return set; + } + ); + } + } +
ClosingMarksByOpeningMark = closingMarksByOpeningMark; + OpeningMarksByClosingMark = openingMarksByClosingMark; + } + + /* Returns the first convention with the given name, or null when none matches. */ public QuoteConvention GetQuoteConventionByName(string name) + { + foreach (QuoteConvention convention in Conventions) + { + if (convention.Name == name) + { + return convention; + } + } + return null; + } + + public IReadOnlyList GetAllQuoteConventionNames() + { + return Conventions.Select(c => c.Name).OrderBy(c => c).ToList(); + } + + public IReadOnlyList GetPossibleOpeningQuotationMarks() + { + return ClosingMarksByOpeningMark.Keys.OrderBy(k => k).ToList(); + } + + public IReadOnlyList GetPossibleClosingQuotationMarks() + { + return OpeningMarksByClosingMark.Keys.OrderBy(k => k).ToList(); + } + + public bool IsValidOpeningQuotationMark(string quotationMark) + { + return ClosingMarksByOpeningMark.ContainsKey(quotationMark); + } + + public bool IsValidClosingQuotationMark(string quotationMark) + { + return OpeningMarksByClosingMark.ContainsKey(quotationMark); + } + + public bool MarksAreAValidPair(string openingMark, string closingMark) + { + return ClosingMarksByOpeningMark.TryGetValue(openingMark, out HashSet set) + && set.Contains(closingMark); + } + + /* True when some convention level uses the same mark as both the opening and the closing mark. */ public bool IsQuotationMarkDirectionAmbiguous(string quotationMark) + { + return ( + ClosingMarksByOpeningMark.TryGetValue(quotationMark, out HashSet closingMarks) + && closingMarks.Contains(quotationMark) + ); + } + + /* Returns every mark that is paired with the given mark, whether the given mark is used as an opening or as a closing mark. */ public HashSet GetPossiblePairedQuotationMarks(string quotationMark) + { + var pairedQuotationMarks = new HashSet(); + if (ClosingMarksByOpeningMark.TryGetValue(quotationMark, out HashSet set)) + { + pairedQuotationMarks.AddRange(set); + } + if (OpeningMarksByClosingMark.TryGetValue(quotationMark, out set)) + { + pairedQuotationMarks.AddRange(set); + } + return pairedQuotationMarks; + } + + public HashSet GetPossibleDepths(string quotationMark, QuotationMarkDirection direction) + { + var depths = new HashSet(); + foreach (QuoteConvention convention in Conventions) + { +
depths.AddRange(convention.GetPossibleDepths(quotationMark, direction)); + } + return depths; + } + + public bool MetadataMatchesQuotationMark(string quotationMark, int depth, QuotationMarkDirection direction) + { + foreach (QuoteConvention convention in Conventions) + { + if (convention.GetExpectedQuotationMark(depth, direction) == quotationMark) + return true; + } + return false; + } + + public QuoteConventionSet FilterToCompatibleQuoteConventions( + List openingQuotationMarks, + List closingQuotationMarks + ) + { + return new QuoteConventionSet( + Conventions + .Where(c => c.IsCompatibleWithObservedQuotationMarks(openingQuotationMarks, closingQuotationMarks)) + .ToList() + ); + } + + public (QuoteConvention Convention, double Similarity) FindMostSimilarConvention( + QuotationMarkTabulator tabulatedQuotationMarks + ) + { + double bestSimilarity = double.MinValue; + QuoteConvention bestQuoteConvention = null; + foreach (QuoteConvention quoteConvention in Conventions) + { + double similarity = tabulatedQuotationMarks.CalculateSimilarity(quoteConvention); + if (similarity > bestSimilarity) + { + bestSimilarity = similarity; + bestQuoteConvention = quoteConvention; + } + } + return (bestQuoteConvention, bestSimilarity); + } + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs b/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs new file mode 100644 index 00000000..5720198e --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs @@ -0,0 +1,221 @@ +using System.Collections.Generic; + +namespace SIL.Machine.PunctuationAnalysis +{ + public static class QuoteConventions + { + public static readonly QuoteConventionSet Standard = new QuoteConventionSet( + new List + { + new QuoteConvention( + "standard_english", + new List + { + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new 
SingleLevelQuoteConvention("\u2018", "\u2019"), + } + ), + new QuoteConvention( + "typewriter_english", + new List + { + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + } + ), + new QuoteConvention( + "british_english", + new List + { + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + } + ), + new QuoteConvention( + "british_typewriter_english", + new List + { + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + } + ), + new QuoteConvention( + "hybrid_typewriter_english", + new List + { + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + } + ), + new QuoteConvention( + "standard_french", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + } + ), + new QuoteConvention( + "typewriter_french", + new List + { + new SingleLevelQuoteConvention("<<", ">>"), + new SingleLevelQuoteConvention("<", ">"), + new SingleLevelQuoteConvention("<<", ">>"), + new SingleLevelQuoteConvention("<", ">"), + } + ), + new QuoteConvention( + "french_variant", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + } + ), + new QuoteConvention( + "western_european", + new List + { + new SingleLevelQuoteConvention("\u00ab", 
"\u00bb"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + } + ), + new QuoteConvention( + "british_inspired_western_european", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + } + ), + new QuoteConvention( + "typewriter_western_european", + new List + { + new SingleLevelQuoteConvention("<<", ">>"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + } + ), + new QuoteConvention( + "typewriter_western_european_variant", + new List + { + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("<", ">"), + new SingleLevelQuoteConvention("'", "'"), + } + ), + new QuoteConvention( + "hybrid_typewriter_western_european", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + } + ), + new QuoteConvention( + "hybrid_british_typewriter_western_european", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + } + ), + new QuoteConvention( + "central_european", + new List + { + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + } + ), + new QuoteConvention( + "central_european_guillemets", + new List + { + new SingleLevelQuoteConvention("\u00bb", "\u00ab"), + new SingleLevelQuoteConvention("\u203a", "\u2039"), + new SingleLevelQuoteConvention("\u00bb", "\u00ab"), + new SingleLevelQuoteConvention("\u203a", "\u2039"), + } + ), + new QuoteConvention( + "standard_swedish", + new List + { + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new 
SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + } + ), + new QuoteConvention( + "standard_finnish", + new List + { + new SingleLevelQuoteConvention("\u00bb", "\u00bb"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + } + ), + new QuoteConvention( + "eastern_european", + new List + { + new SingleLevelQuoteConvention("\u201e", "\u201d"), + new SingleLevelQuoteConvention("\u201a", "\u2019"), + new SingleLevelQuoteConvention("\u201e", "\u201d"), + new SingleLevelQuoteConvention("\u201a", "\u2019"), + } + ), + new QuoteConvention( + "standard_russian", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + } + ), + new QuoteConvention( + "standard_arabic", + new List + { + new SingleLevelQuoteConvention("\u201d", "\u201c"), + new SingleLevelQuoteConvention("\u2019", "\u2018"), + new SingleLevelQuoteConvention("\u201d", "\u201c"), + new SingleLevelQuoteConvention("\u2019", "\u2018"), + } + ), + new QuoteConvention( + "non-standard_arabic", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2019", "\u2018"), + } + ), + } + ); + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs new file mode 100644 index 00000000..f2f783fb --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs @@ -0,0 +1,149 @@ +using System; +using System.Collections.Generic; +using SIL.Machine.Corpora; + +namespace SIL.Machine.PunctuationAnalysis +{ + public class TextSegment : IEquatable + { + public string Text { get; private set; } + public UsfmMarkerType ImmediatePrecedingMarker { get; private set; } + public HashSet MarkersInPrecedingContext { get; private set; } + public TextSegment PreviousSegment { get; set; } + public 
TextSegment NextSegment { get; set; } + public int IndexInVerse { get; set; } + public int NumSegmentsInVerse { get; set; } + public UsfmToken UsfmToken { get; private set; } + + public TextSegment() + { + Text = ""; + ImmediatePrecedingMarker = UsfmMarkerType.NoMarker; + MarkersInPrecedingContext = new HashSet(); + PreviousSegment = null; + NextSegment = null; + IndexInVerse = 0; + NumSegmentsInVerse = 0; + UsfmToken = null; + } + + public TextSegment(string text) + { + Text = text; + ImmediatePrecedingMarker = UsfmMarkerType.NoMarker; + MarkersInPrecedingContext = new HashSet(); + PreviousSegment = null; + NextSegment = null; + IndexInVerse = 0; + NumSegmentsInVerse = 0; + UsfmToken = null; + } + + public override bool Equals(object obj) + { + if (!(obj is TextSegment other)) + { + return false; + } + return Equals(other); + } + + public bool Equals(TextSegment other) + { + return !(other is null) /* a null segment never equals an instance */ && Text.Equals(other.Text) + && IndexInVerse.Equals(other.IndexInVerse) + && NumSegmentsInVerse.Equals(other.NumSegmentsInVerse) + && ( + (UsfmToken == null && other.UsfmToken == null) + || (UsfmToken != null && other.UsfmToken != null && UsfmToken.Equals(other.UsfmToken)) + ) + && ImmediatePrecedingMarker.Equals(other.ImmediatePrecedingMarker); + } + + public override int GetHashCode() + { + int hashCode = 23; + hashCode = hashCode * 31 + Text.GetHashCode(); + hashCode = hashCode * 31 + IndexInVerse.GetHashCode(); + hashCode = hashCode * 31 + NumSegmentsInVerse.GetHashCode(); + hashCode = hashCode * 31 + (UsfmToken?.GetHashCode() ?? 0); /* UsfmToken may be null (both constructors set it to null); hash it as 0, matching Equals */ + return hashCode * 31 + ImmediatePrecedingMarker.GetHashCode(); + } + + public int Length => Text.Length; + + public string SubstringBefore(int index) + { + return Text.Substring(0, index); + } + + public string SubstringAfter(int index) + { + return Text.Substring(index); + } + + public bool MarkerIsInPrecedingContext(UsfmMarkerType marker) + { + return MarkersInPrecedingContext.Contains(marker); + } + + public bool IsFirstSegmentInVerse() + { + return 
IndexInVerse == 0; + } + + public bool IsLastSegmentInVerse() + { + return IndexInVerse == NumSegmentsInVerse - 1; + } + + public void ReplaceSubstring(int startIndex, int endIndex, string replacement) + { + Text = SubstringBefore(startIndex) + replacement + SubstringAfter(endIndex); + if (UsfmToken != null) + { + UsfmToken.Text = Text; + } + } + + public class Builder + { + private readonly TextSegment _textSegment; + + public Builder() + { + _textSegment = new TextSegment(); + } + + public Builder SetPreviousSegment(TextSegment previousSegment) + { + _textSegment.PreviousSegment = previousSegment; + return this; + } + + public Builder AddPrecedingMarker(UsfmMarkerType marker) + { + _textSegment.ImmediatePrecedingMarker = marker; + _textSegment.MarkersInPrecedingContext.Add(marker); + return this; + } + + public Builder SetUsfmToken(UsfmToken token) + { + _textSegment.UsfmToken = token; + return this; + } + + public Builder SetText(string text) + { + _textSegment.Text = text; + return this; + } + + public TextSegment Build() + { + return _textSegment; + } + } + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/UsfmMarkerType.cs b/src/SIL.Machine/PunctuationAnalysis/UsfmMarkerType.cs new file mode 100644 index 00000000..5e61d470 --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/UsfmMarkerType.cs @@ -0,0 +1,13 @@ +namespace SIL.Machine.PunctuationAnalysis +{ + public enum UsfmMarkerType + { + Paragraph, + Character, + Verse, + Chapter, + Embed, + Other, + NoMarker, + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs b/src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs new file mode 100644 index 00000000..ce2d6cd7 --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs @@ -0,0 +1,166 @@ +using System.Collections.Generic; +using SIL.Machine.Corpora; + +namespace SIL.Machine.PunctuationAnalysis +{ + public class UsfmStructureExtractor : IUsfmParserHandler + { + private readonly List 
_textSegments; + private TextSegment.Builder _nextTextSegmentBuilder; + + public UsfmStructureExtractor() + { + _textSegments = new List(); + _nextTextSegmentBuilder = new TextSegment.Builder(); + } + + public void Chapter(UsfmParserState state, string number, string marker, string altNumber, string pubNumber) + { + _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Chapter); + } + + public void EndBook(UsfmParserState state, string marker) { } + + public void EndCell(UsfmParserState state, string marker) { } + + public void EndChar(UsfmParserState state, string marker, IReadOnlyList attributes, bool closed) + { + _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Character); + } + + public void EndNote(UsfmParserState state, string marker, bool closed) + { + _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed); + } + + public void EndPara(UsfmParserState state, string marker) { } + + public void EndRow(UsfmParserState state, string marker) { } + + public void EndSidebar(UsfmParserState state, string marker, bool closed) + { + _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed); + } + + public void EndTable(UsfmParserState state) + { + _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed); + } + + public void EndUsfm(UsfmParserState state) { } + + public void GotMarker(UsfmParserState state, string marker) { } + + public void Milestone( + UsfmParserState state, + string marker, + bool startMilestone, + IReadOnlyList attributes + ) { } + + public void OptBreak(UsfmParserState state) { } + + public void Ref(UsfmParserState state, string marker, string display, string target) + { + _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed); + } + + public void StartBook(UsfmParserState state, string marker, string code) { } + + public void StartCell(UsfmParserState state, string marker, string align, int colspan) { } + + public void StartChar( + UsfmParserState state, + string markerWithoutPlus, + bool unknown, + 
IReadOnlyList attributes + ) + { + _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Character); + } + + public void StartNote(UsfmParserState state, string marker, string caller, string category) { } + + public void StartPara( + UsfmParserState state, + string marker, + bool unknown, + IReadOnlyList attributes + ) + { + _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Paragraph); + } + + public void StartRow(UsfmParserState state, string marker) { } + + public void StartSidebar(UsfmParserState state, string marker, string category) { } + + public void StartTable(UsfmParserState state) { } + + public void StartUsfm(UsfmParserState state) { } + + public void Text(UsfmParserState state, string text) + { + if (!state.IsVerseText) + return; + if (text.Length > 0) + { + _nextTextSegmentBuilder.SetText(text); + TextSegment textSegment = _nextTextSegmentBuilder.Build(); + // Don't look past verse boundaries, to enable identical functionality in the + // online one-verse-at-a-time (QuotationMarkDenormalizationScriptureUpdateBlockHandler) + // and offline whole-book-at-once settings (QuoteConventionDetector) + if (_textSegments.Count > 0 && !textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse)) + { + _textSegments[_textSegments.Count - 1].NextSegment = textSegment; + textSegment.PreviousSegment = _textSegments[_textSegments.Count - 1]; + } + _textSegments.Add(textSegment); + } + _nextTextSegmentBuilder = new TextSegment.Builder(); + } + + public void Unmatched(UsfmParserState state, string marker) { } + + public void Verse(UsfmParserState state, string number, string marker, string altNumber, string pubNumber) + { + _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse); + } + + public List GetChapters() + { + var chapters = new List(); + var currentChapterVerses = new List(); + var currentVerseSegments = new List(); + foreach (TextSegment textSegment in _textSegments) + { + if (textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse)) 
+ { + if (currentVerseSegments.Count > 0) + { + currentChapterVerses.Add(new Verse(currentVerseSegments)); + } + currentVerseSegments = new List(); + } + if (textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Chapter)) + { + if (currentChapterVerses.Count > 0) + { + chapters.Add(new Chapter(currentChapterVerses)); + } + currentChapterVerses = new List(); + } + currentVerseSegments.Add(textSegment); + } + if (currentVerseSegments.Count > 0) + { + currentChapterVerses.Add(new Verse(currentVerseSegments)); + } + if (currentChapterVerses.Count > 0) + { + chapters.Add(new Chapter(currentChapterVerses)); + } + return chapters; + } + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/Verse.cs b/src/SIL.Machine/PunctuationAnalysis/Verse.cs new file mode 100644 index 00000000..2f5364eb --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/Verse.cs @@ -0,0 +1,25 @@ +using System.Collections.Generic; +using System.Linq; + +namespace SIL.Machine.PunctuationAnalysis +{ + public class Verse + { + public IReadOnlyList TextSegments { get; private set; } + + public Verse(List textSegments) + { + TextSegments = textSegments; + IndexTextSegments(); + } + + private void IndexTextSegments() + { + foreach ((int index, TextSegment textSegment) in TextSegments.Select((t, i) => (i, t))) + { + textSegment.IndexInVerse = index; + textSegment.NumSegmentsInVerse = TextSegments.Count; + } + } + } +} diff --git a/src/SIL.Machine/SIL.Machine.csproj b/src/SIL.Machine/SIL.Machine.csproj index 6a7cfbcd..0307d374 100644 --- a/src/SIL.Machine/SIL.Machine.csproj +++ b/src/SIL.Machine/SIL.Machine.csproj @@ -38,6 +38,7 @@ + diff --git a/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs b/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs new file mode 100644 index 00000000..09462018 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/FallbackQuotationMarkResolverTests.cs @@ -0,0 +1,452 @@ +using NUnit.Framework; +using SIL.Machine.PunctuationAnalysis; + 
+namespace SIL.Machine.Corpora; + +[TestFixture] +public class FallbackQuotationMarkResolverTests +{ + [Test] + public void Reset() + { + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(englishQuoteConvention) + ); + + basicQuotationMarkResolver.LastQuotationMark = new QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("\"'test text\"").Build(), + 0, + 1 + ); + basicQuotationMarkResolver.Issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark); + + basicQuotationMarkResolver.Reset(); + Assert.IsNull(basicQuotationMarkResolver.LastQuotationMark); + Assert.That(basicQuotationMarkResolver.Issues.Count, Is.EqualTo(0)); + } + + [Test] + public void SimpleQuotationMarkResolutionWithNoPreviousMark() + { + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(englishQuoteConvention.Normalize()) + ); + + var actualResolvedQuotationMarks = basicQuotationMarkResolver + .ResolveQuotationMarks( + [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test \" text").Build(), 5, 6),] + ) + .ToList(); + List expectedResolvedQuotationMarks = + [ + new QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("test \" text").Build(), + 5, + 6 + ) + ]; + + AssertResolvedQuotationMarksEqual(actualResolvedQuotationMarks, expectedResolvedQuotationMarks); + } + + [Test] + public void SimpleQuotationMarkResolutionWithPreviousOpeningMark() + { + QuoteConvention englishQuoteConvention = 
QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(englishQuoteConvention.Normalize()) + ); + + var actualResolvedQuotationMarks = basicQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test \" text").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test \" text").Build(), 6, 7), + ] + ) + .ToList(); + List expectedResolvedQuotationMarks = + [ + new QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("\"test \" text").Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().SetText("\"test \" text").Build(), + 6, + 7 + ), + ]; + + AssertResolvedQuotationMarksEqual(actualResolvedQuotationMarks, expectedResolvedQuotationMarks); + } + + [Test] + public void SimpleQuotationMarkResolutionWithPreviousClosingMark() + { + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(englishQuoteConvention.Normalize()) + ); + + var actualResolvedQuotationMarks = basicQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test\" \" text").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test\" \" text").Build(), 6, 7), + ] + ) + .ToList(); + List expectedResolvedQuotationMarks = + [ + new QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().SetText("test\" \" text").Build(), + 4, + 5 + ), + new QuotationMarkMetadata( + "\"", + 
1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("test\" \" text").Build(), + 6, + 7 + ) + ]; + + AssertResolvedQuotationMarksEqual(actualResolvedQuotationMarks, expectedResolvedQuotationMarks); + } + + [Test] + public void IsOpeningQuote() + { + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(englishQuoteConvention.Normalize()) + ); + + // valid opening quote at start of segment + var quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test text\"").Build(), 0, 1); + Assert.IsTrue(basicQuotationMarkResolver.IsOpeningQuotationMark(quoteMatch)); + + // opening quote with leading whitespace + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test \"text\"").Build(), 5, 6); + Assert.IsTrue(basicQuotationMarkResolver.IsOpeningQuotationMark(quoteMatch)); + + // opening quote with quote introducer + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test:\"text\"").Build(), 5, 6); + Assert.IsTrue(basicQuotationMarkResolver.IsOpeningQuotationMark(quoteMatch)); + + // QuotationMarkStringMatch indices don't indicate a quotation mark + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test \"text\"").Build(), 0, 1); + Assert.IsFalse(basicQuotationMarkResolver.IsOpeningQuotationMark(quoteMatch)); + + // the quotation mark is not valid under the current quote convention + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("").Build(), 10, 11); + Assert.IsFalse(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + + // no trailing whitespace after quotation mark + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test\"text").Build(), 5, 6); + 
Assert.IsFalse(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + + // opening quote at the start of the segment + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test text\"").Build(), 0, 1); + Assert.IsFalse(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + + // opening quote with leading whitespace + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test \"text\"").Build(), 5, 6); + Assert.IsFalse(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + } + + [Test] + public void IsClosingQuoteWithUnambiguousQuoteConvention() + { + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuoteConventionDetectionResolutionSettings(new QuoteConventionSet([englishQuoteConvention])) + ); + + // unambiguous closing quote at end of segment + var quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("“test text”").Build(), 10, 11); + Assert.IsTrue(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + + // unambiguous closing quote with trailing whitespace + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("“test” text").Build(), 5, 6); + Assert.IsTrue(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + + // unambiguous closing quote without the "correct" context + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("“test”text").Build(), 5, 6); + Assert.IsTrue(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + + // unambiguous opening quote + quoteMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("test “text”").Build(), 5, 6); + Assert.IsFalse(basicQuotationMarkResolver.IsClosingQuotationMark(quoteMatch)); + } + + [Test] + public void ResolveOpeningQuote() + { + 
QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(englishQuoteConvention.Normalize()) + ); + + var expectedResolvedQuotationMark = new QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("\"test text\"").Build(), + 0, + 1 + ); + QuotationMarkMetadata actualResolvedQuotationMark = basicQuotationMarkResolver.ResolveOpeningMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test text\"").Build(), 0, 1) + ); + Assert.That(actualResolvedQuotationMark, Is.EqualTo(expectedResolvedQuotationMark)); + Assert.That(basicQuotationMarkResolver.LastQuotationMark, Is.EqualTo(actualResolvedQuotationMark)); + } + + [Test] + public void ResolveClosingQuote() + { + QuoteConvention englishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + Assert.IsNotNull(englishQuoteConvention); + + var basicQuotationMarkResolver = new FallbackQuotationMarkResolver( + new QuotationMarkUpdateResolutionSettings(englishQuoteConvention.Normalize()) + ); + + var expectedResolvedQuotationMark = new QuotationMarkMetadata( + "\"", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().SetText("\"test text\"").Build(), + 10, + 11 + ); + QuotationMarkMetadata actualResolvedQuotationMark = basicQuotationMarkResolver.ResolveClosingMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"test text\"").Build(), 10, 11) + ); + Assert.That(actualResolvedQuotationMark, Is.EqualTo(expectedResolvedQuotationMark)); + } + + public void AssertResolvedQuotationMarksEqual( + List actualResolvedQuotationMarks, + List expectedResolvedQuotationMarks + ) + { + Assert.That(actualResolvedQuotationMarks.Count, Is.EqualTo(expectedResolvedQuotationMarks.Count)); + foreach 
( + (QuotationMarkMetadata actualMark, QuotationMarkMetadata expectedMark) in actualResolvedQuotationMarks.Zip( + expectedResolvedQuotationMarks + ) + ) + { + Assert.That(actualMark, Is.EqualTo(expectedMark)); + } + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/MemoryParatextProjectQuoteConvetionDetector.cs b/tests/SIL.Machine.Tests/Corpora/MemoryParatextProjectQuoteConvetionDetector.cs new file mode 100644 index 00000000..01d959d8 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/MemoryParatextProjectQuoteConvetionDetector.cs @@ -0,0 +1,23 @@ +using System.Text; + +namespace SIL.Machine.Corpora; + +public class MemoryParatextProjectQuoteConventionDetector( + ParatextProjectSettings settings, + IDictionary files +) : ParatextProjectQuoteConventionDetector(settings) +{ + public IDictionary Files { get; } = files; + + protected override bool Exists(string fileName) + { + return Files.ContainsKey(fileName); + } + + protected override Stream? Open(string fileName) + { + if (!Files.TryGetValue(fileName, out string? contents)) + return null; + return new MemoryStream(Encoding.UTF8.GetBytes(contents)); + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs b/tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs new file mode 100644 index 00000000..46a75faa --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs @@ -0,0 +1,82 @@ +using System.Text; +using NUnit.Framework; +using SIL.Machine.PunctuationAnalysis; +using SIL.Scripture; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class ParatextProjectQuoteConventionDetectorTests +{ + [Test] + public void TestGetQuotationAnalysis() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "41MATTest.SFM", + @"\id MAT +\c 1 +\v 1 Someone said, “This is something I am saying! +\v 2 This is also something I am saying” (that is, “something I am speaking”). 
+\p +\v 3 Other text, and someone else said, +\q1 +\v 4 “Things +\q2 someone else said! +\q3 and more things someone else said.” +\m That is why he said “things someone else said.” +\v 5 Then someone said, “More things someone said.”" + } + } + ); + QuoteConventionAnalysis analysis = env.GetQuoteConvention(); + Assert.That(analysis, Is.Not.Null); + Assert.That(analysis.BestQuoteConventionScore, Is.GreaterThan(0.8)); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english")); + } + + private class TestEnvironment(ParatextProjectSettings? settings = null, Dictionary? files = null) + { + public ParatextProjectQuoteConventionDetector Detector { get; } = + new MemoryParatextProjectQuoteConventionDetector( + settings ?? new DefaultParatextProjectSettings(), + files ?? new() + ); + + public QuoteConventionAnalysis GetQuoteConvention() + { + return Detector.GetQuoteConventionAnalysis(); + } + } + + private class DefaultParatextProjectSettings( + string name = "Test", + string fullName = "TestProject", + Encoding? encoding = null, + ScrVers? versification = null, + UsfmStylesheet? stylesheet = null, + string fileNamePrefix = "", + string fileNameForm = "41MAT", + string fileNameSuffix = "Test.SFM", + string biblicalTermsListType = "Project", + string biblicalTermsProjectName = "Test", + string biblicalTermsFileName = "ProjectBiblicalTerms.xml", + string languageCode = "en" + ) + : ParatextProjectSettings( + name, + fullName, + encoding ?? Encoding.UTF8, + versification ?? ScrVers.English, + stylesheet ?? 
new UsfmStylesheet("usfm.sty"), + fileNamePrefix, + fileNameForm, + fileNameSuffix, + biblicalTermsListType, + biblicalTermsProjectName, + biblicalTermsFileName, + languageCode + ) { } +} diff --git a/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs index 7c17e7cf..9769e475 100644 --- a/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs @@ -15,7 +15,23 @@ public void UpdateUsfm_ParagraphMarkers() string source = "This is the first paragraph. This text is in English, and this test is for paragraph markers."; string pretranslation = "Este es el primer párrafo. Este texto está en inglés y esta prueba es para marcadores de párrafo."; - IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), pretranslation)]; + PlaceMarkersAlignmentInfo alignInfo = new PlaceMarkersAlignmentInfo( + sourceTokens: Tokenizer.Tokenize(source).ToList(), + translationTokens: Tokenizer.Tokenize(pretranslation).ToList(), + alignment: ToWordAlignmentMatrix( + "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" + ), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ); + IReadOnlyList rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + pretranslation, + new Dictionary { { "alignment_info", alignInfo } } + ) + ]; string usfm = @"\id MAT \c 1 @@ -23,23 +39,12 @@ public void UpdateUsfm_ParagraphMarkers() \p This text is in English, \p and this test is for paragraph markers. 
"; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: Tokenizer.Tokenize(source).ToList(), - translationTokens: Tokenizer.Tokenize(pretranslation).ToList(), - alignment: ToWordAlignmentMatrix( - "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" - ) - ) - ]; string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -59,29 +64,34 @@ public void UpdateUsfm_StyleMarkers() string source = "This is the first sentence. This text is in English, and this test is for style markers."; string pretranslation = "Esta es la primera oración. Este texto está en inglés y esta prueba es para marcadores de estilo."; - IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), pretranslation)]; + PlaceMarkersAlignmentInfo alignInfo = new PlaceMarkersAlignmentInfo( + sourceTokens: Tokenizer.Tokenize(source).ToList(), + translationTokens: Tokenizer.Tokenize(pretranslation).ToList(), + alignment: ToWordAlignmentMatrix( + "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" + ), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Preserve + ); + IReadOnlyList rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + pretranslation, + new Dictionary { { "alignment_info", alignInfo } } + ) + ]; string usfm = @"\id MAT \c 1 \v 1 This is the \w first\w* sentence. This text is in \w English\w*, and this test is \w for\w* style markers. 
"; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: Tokenizer.Tokenize(source).ToList(), - translationTokens: Tokenizer.Tokenize(pretranslation).ToList(), - alignment: ToWordAlignmentMatrix( - "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" - ) - ) - ]; string target = UpdateUsfm( rows, usfm, styleBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -92,11 +102,29 @@ public void UpdateUsfm_StyleMarkers() AssertUsfmEquals(target, result); + alignInfo = new PlaceMarkersAlignmentInfo( + sourceTokens: Tokenizer.Tokenize(source).ToList(), + translationTokens: Tokenizer.Tokenize(pretranslation).ToList(), + alignment: ToWordAlignmentMatrix( + "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" + ), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ); + rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + pretranslation, + new Dictionary { { "alignment_info", alignInfo } } + ) + ]; + target = UpdateUsfm( rows, usfm, styleBehavior: UpdateUsfmMarkerBehavior.Strip, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); result = @@ -112,16 +140,16 @@ public void UpdateUsfm_StyleMarkers() [Test] public void UpdateUsfm_EmbedMarkers() { - IReadOnlyList<(IReadOnlyList, string)> rows = + IReadOnlyList rows = [ - (ScrRef("MAT 1:1"), "New verse 1"), - (ScrRef("MAT 1:2"), "New verse 2"), - (ScrRef("MAT 1:3"), "New verse 3"), - (ScrRef("MAT 1:4"), "New verse 4"), - (ScrRef("MAT 1:4/1:f"), "New embed text"), - (ScrRef("MAT 1:5"), "New verse 5"), - (ScrRef("MAT 1:6"), "New verse 6"), - (ScrRef("MAT 1:6/1:f"), "New verse 6 embed 
text") + new UpdateUsfmRow(ScrRef("MAT 1:1"), "New verse 1"), + new UpdateUsfmRow(ScrRef("MAT 1:2"), "New verse 2"), + new UpdateUsfmRow(ScrRef("MAT 1:3"), "New verse 3"), + new UpdateUsfmRow(ScrRef("MAT 1:4"), "New verse 4"), + new UpdateUsfmRow(ScrRef("MAT 1:4/1:f"), "New embed text"), + new UpdateUsfmRow(ScrRef("MAT 1:5"), "New verse 5"), + new UpdateUsfmRow(ScrRef("MAT 1:6"), "New verse 6"), + new UpdateUsfmRow(ScrRef("MAT 1:6/1:f"), "New verse 6 embed text") ]; string usfm = @"\id MAT @@ -133,13 +161,12 @@ public void UpdateUsfm_EmbedMarkers() \v 5 Embed with style markers \f \fr 1.5 \ft A \+w stylish\+w* note \f* \v 6 Updated embed with style markers \f \fr 1.6 \ft Another \+w stylish\+w* note \f* "; - IReadOnlyList alignInfo = []; string target = UpdateUsfm( rows, usfm, embedBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -159,7 +186,7 @@ public void UpdateUsfm_EmbedMarkers() rows, usfm, embedBehavior: UpdateUsfmMarkerBehavior.Strip, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); result = @@ -179,7 +206,21 @@ public void UpdateUsfm_EmbedMarkers() [Test] public void UpdateUsfm_TrailingEmptyParagraphs() { - IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), "New verse 1")]; + PlaceMarkersAlignmentInfo alignInfo = new PlaceMarkersAlignmentInfo( + sourceTokens: ["Verse", "1"], + translationTokens: ["New", "verse", "1"], + alignment: ToWordAlignmentMatrix("0-1 1-2"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ); + IReadOnlyList rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "New verse 1", + new Dictionary { { "alignment_info", alignInfo } } + ) + ]; string usfm = @"\id MAT \c 1 @@ -188,21 +229,12 @@ public 
void UpdateUsfm_TrailingEmptyParagraphs() \b \q1 \f embed 2 \f* "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: ["Verse", "1"], - translationTokens: ["New", "verse", "1"], - alignment: ToWordAlignmentMatrix("0-1 1-2") - ) - ]; string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -219,12 +251,44 @@ public void UpdateUsfm_TrailingEmptyParagraphs() [Test] public void UpdateUsfm_Headers() { - IReadOnlyList<(IReadOnlyList, string)> rows = + IReadOnlyList rows = [ - (ScrRef("MAT 1:1"), "X Y Z"), - (ScrRef("MAT 1:2"), "X"), - (ScrRef("MAT 1:3"), "Y"), - (ScrRef("MAT 1:3/1:s1"), "Updated header") + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "X Y Z", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["A", "B", "C"], + translationTokens: ["X", "Y", "Z"], + alignment: ToWordAlignmentMatrix("0-0 1-1 2-2"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ), + new UpdateUsfmRow( + ScrRef("MAT 1:2"), + "X", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["A"], + translationTokens: ["X"], + alignment: ToWordAlignmentMatrix("0-0"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ), + new UpdateUsfmRow(ScrRef("MAT 1:3"), "Y"), + new UpdateUsfmRow(ScrRef("MAT 1:3/1:s1"), "Updated header") ]; string usfm = @"\id MAT @@ -248,27 +312,12 @@ public void UpdateUsfm_Headers() \v 3 B \s1 Header to be updated "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: ["A", "B", "C"], - translationTokens: ["X", "Y", "Z"], - alignment: 
ToWordAlignmentMatrix("0-0 1-1 2-2") - ), - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:2"], - sourceTokens: ["A"], - translationTokens: ["X"], - alignment: ToWordAlignmentMatrix("0-0") - ) - ]; string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -300,29 +349,39 @@ public void UpdateUsfm_Headers() [Test] public void UpdateUsfm_ConsecutiveMarkers() { - IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), "New verse 1 WORD"),]; + IReadOnlyList rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "New verse 1 WORD", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["Old", "verse", "1", "word"], + translationTokens: ["New", "verse", "1", "WORD"], + alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 3-3"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Preserve + ) + } + } + ), + ]; string usfm = @"\id MAT \c 1 \v 1 Old verse 1 \p \qt \+w word\+w*\qt* "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: ["Old", "verse", "1", "word"], - translationTokens: ["New", "verse", "1", "WORD"], - alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 3-3") - ) - ]; string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, styleBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -338,34 +397,38 @@ public void UpdateUsfm_ConsecutiveMarkers() [Test] public void UpdateUsfm_VerseRanges() { - IReadOnlyList<(IReadOnlyList, string)> rows = - [ - ( - Enumerable.Range(1, 6).Select(i => ScriptureRef.Parse($"MAT 1:{i}")).ToList(), - 
"New verse range text new paragraph 2" - ) - ]; + IReadOnlyList rows = Enumerable + .Range(1, 6) + .Select(i => new UpdateUsfmRow( + [ScriptureRef.Parse($"MAT 1:{i}")], + "New verse range text new paragraph 2", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["Verse", "range", "old", "paragraph", "2"], + translationTokens: ["New", "verse", "range", "text", "new", "paragraph", "2"], + alignment: ToWordAlignmentMatrix("0-1 1-2 2-4 3-5 4-6"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + )) + .ToList(); string usfm = @"\id MAT \c 1 \v 1-5 Verse range \p old paragraph 2 "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: Enumerable.Range(1, 6).Select(i => ScriptureRef.Parse($"MAT 1:{i}").ToString()).ToList(), - sourceTokens: ["Verse", "range", "old", "paragraph", "2"], - translationTokens: ["New", "verse", "range", "text", "new", "paragraph", "2"], - alignment: ToWordAlignmentMatrix("0-1 1-2 2-4 3-5 4-6") - ) - ]; string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -381,9 +444,26 @@ public void UpdateUsfm_VerseRanges() [Test] public void UpdateUsfm_NoUpdate() { - IReadOnlyList<(IReadOnlyList, string)> rows = + //Strip paragraphs + IReadOnlyList rows = [ - (ScrRef("MAT 1:1"), "New paragraph 1 New paragraph 2"), + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "New paragraph 1 New paragraph 2", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["Old", "paragraph", "1", "Old", "paragraph", "2"], + translationTokens: ["New", "paragraph", "1", "New", "paragraph", "2"], + alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 3-3 4-4 5-5"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Strip, + 
styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ), ]; string usfm = @"\id MAT @@ -392,22 +472,11 @@ public void UpdateUsfm_NoUpdate() \p Old paragraph 2 "; - //Strip paragraphs - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: ["Old", "paragraph", "1", "Old", "paragraph", "2"], - translationTokens: ["New", "paragraph", "1", "New", "paragraph", "2"], - alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 3-3 4-4 5-5") - ) - ]; - string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Strip, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -419,21 +488,32 @@ public void UpdateUsfm_NoUpdate() AssertUsfmEquals(target, result); //No alignment - alignInfo = + rows = [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: [], - translationTokens: [], - alignment: ToWordAlignmentMatrix("") - ) + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "New paragraph 1 New paragraph 2", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: [], + translationTokens: [], + alignment: ToWordAlignmentMatrix(""), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ), ]; target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); result = @@ -447,12 +527,11 @@ public void UpdateUsfm_NoUpdate() // No text update rows = []; - alignInfo = []; target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); result = @@ -467,9 
+546,25 @@ public void UpdateUsfm_NoUpdate() [Test] public void UpdateUsfm_SplitTokens() { - IReadOnlyList<(IReadOnlyList, string)> rows = + IReadOnlyList rows = [ - (ScrRef("MAT 1:1"), "words split words split words split"), + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "words split words split words split", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["words", "split", "words", "split", "words", "split"], + translationTokens: ["words", "split", "words", "split", "words", "split"], + alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 3-3 4-4 5-5"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ), ]; string usfm = @"\id MAT @@ -479,21 +574,11 @@ public void UpdateUsfm_SplitTokens() \p it words split "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: ["words", "split", "words", "split", "words", "split"], - translationTokens: ["words", "split", "words", "split", "words", "split"], - alignment: ToWordAlignmentMatrix("0-0 1-1 2-2 3-3 4-4 5-5") - ) - ]; - string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -510,29 +595,38 @@ public void UpdateUsfm_SplitTokens() [Test] public void UpdateUsfm_NoText() { - IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), ""),]; + IReadOnlyList rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: [], + translationTokens: [], + alignment: ToWordAlignmentMatrix(""), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Preserve + ) + } + } + ), + ]; string usfm = @"\id MAT \c 1 \v 1 \w \w* "; - IReadOnlyList 
alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: [], - translationTokens: [], - alignment: ToWordAlignmentMatrix("") - ) - ]; - string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, styleBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -547,7 +641,26 @@ public void UpdateUsfm_NoText() [Test] public void UpdateUsfm_ConsecutiveSubstring() { - IReadOnlyList<(IReadOnlyList, string)> rows = [(ScrRef("MAT 1:1"), "string ring"),]; + IReadOnlyList rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "string ring", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["string", "ring"], + translationTokens: ["string", "ring"], + alignment: ToWordAlignmentMatrix("0-0 1-1"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ), + ]; string usfm = @"\id MAT \c 1 @@ -555,21 +668,11 @@ public void UpdateUsfm_ConsecutiveSubstring() \p ring "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: ["string", "ring"], - translationTokens: ["string", "ring"], - alignment: ToWordAlignmentMatrix("0-0 1-1") - ) - ]; - string target = UpdateUsfm( rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -585,10 +688,42 @@ public void UpdateUsfm_ConsecutiveSubstring() [Test] public void UpdateUsfm_VersesOutOfOrder() { - IReadOnlyList<(IReadOnlyList, string)> rows = + IReadOnlyList rows = [ - (ScrRef("MAT 1:1"), "new verse 1 new paragraph 2"), - (ScrRef("MAT 1:2"), "new verse 2") + new UpdateUsfmRow( + ScrRef("MAT 
1:1"), + "new verse 1 new paragraph 2", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["verse", "1", "paragraph", "2"], + translationTokens: ["new", "verse", "1", "new", "paragraph", "2"], + alignment: ToWordAlignmentMatrix("0-1 1-2 2-4 3-5"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ), + new UpdateUsfmRow( + ScrRef("MAT 1:2"), + "new verse 2", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + sourceTokens: ["verse", "2"], + translationTokens: ["new", "verse", "2"], + alignment: ToWordAlignmentMatrix("0-1 1-2"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve, + styleBehavior: UpdateUsfmMarkerBehavior.Strip + ) + } + } + ) ]; string usfm = @"\id MAT @@ -598,27 +733,13 @@ public void UpdateUsfm_VersesOutOfOrder() \p paragraph 2 "; - IReadOnlyList alignInfo = - [ - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:1"], - sourceTokens: ["verse", "1", "paragraph", "2"], - translationTokens: ["new", "verse", "1", "new", "paragraph", "2"], - alignment: ToWordAlignmentMatrix("0-1 1-2 2-4 3-5") - ), - new PlaceMarkersAlignmentInfo( - refs: ["MAT 1:2"], - sourceTokens: ["verse", "2"], - translationTokens: ["new", "verse", "2"], - alignment: ToWordAlignmentMatrix("0-1 1-2") - ) - ]; + IReadOnlyList alignInfo = []; string target = UpdateUsfm( rows, usfm, textBehavior: UpdateUsfmTextBehavior.StripExisting, - usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler(alignInfo)] + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] ); string result = @@ -632,6 +753,58 @@ public void UpdateUsfm_VersesOutOfOrder() AssertUsfmEquals(target, result); } + [Test] + public void UpdateUsfm_StripParagraphsWithHeaders() + { + IReadOnlyList rows = + [ + new UpdateUsfmRow( + ScrRef("MAT 1:1"), + "new verse 1 new paragraph 2", + new Dictionary + { + { + "alignment_info", + new PlaceMarkersAlignmentInfo( + 
sourceTokens: ["verse", "1", "paragraph", "2"], + translationTokens: ["new", "verse", "1", "new", "paragraph", "2"], + alignment: ToWordAlignmentMatrix("0-1 1-2 2-4 3-5"), + paragraphBehavior: UpdateUsfmMarkerBehavior.Strip, + styleBehavior: UpdateUsfmMarkerBehavior.Preserve + ) + } + } + ), + ]; + string usfm = + @"\id MAT +\c 1 +\v 1 verse 1 +\s header +\p paragraph 2 +\v 2 verse 2 +"; + + string target = UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Strip, + styleBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()] + ); + + string result = + @"\id MAT +\c 1 +\v 1 new verse 1 new paragraph 2 +\s header +\p +\v 2 verse 2 +"; + + AssertUsfmEquals(target, result); + } + private static ScriptureRef[] ScrRef(params string[] refs) { return refs.Select(r => ScriptureRef.Parse(r)).ToArray(); @@ -653,7 +826,7 @@ private static WordAlignmentMatrix ToWordAlignmentMatrix(string alignment) } private static string UpdateUsfm( - IReadOnlyList<(IReadOnlyList, string)> rows, + IReadOnlyList rows, string source, string? idText = null, UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferNew, diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs new file mode 100644 index 00000000..6b9fcfdc --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs @@ -0,0 +1,61 @@ +using NUnit.Framework; +using SIL.Machine.PunctuationAnalysis; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class QuotationDenormalizationTests +{ + [Test] + public void FullQuotationDenormalizationPipeline() + { + string normalizedUsfm = + @" + \id GEN + \c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, ""Has God really said, + 'You shall not eat of any tree of the garden'?"" + \v 2 The woman said to the serpent, + ""We may eat fruit from the trees of the garden, + \v 3 but not the fruit of the tree which is in the middle of the garden. + God has said, 'You shall not eat of it. You shall not touch it, lest you die.'"" + "; + + string expectedDenormalizedUsfm = + @"\id GEN +\c 1 +\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?” +\v 2 The woman said to the serpent, “We may eat fruit from the trees of the garden, +\v 3 but not the fruit of the tree which is in the middle of the garden. God has said, ‘You shall not eat of it. You shall not touch it, lest you die.’” +"; + + QuoteConvention standardEnglishQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + "standard_english" + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + + var quotationMarkDenormalizationFirstPass = new QuotationMarkDenormalizationFirstPass( + standardEnglishQuoteConvention, + standardEnglishQuoteConvention + ); + + UsfmParser.Parse(normalizedUsfm, quotationMarkDenormalizationFirstPass); + List bestChapterStrategies = + quotationMarkDenormalizationFirstPass.FindBestChapterStrategies(); + + var quotationMarkDenormalizer = new QuotationMarkDenormalizationUsfmUpdateBlockHandler( + standardEnglishQuoteConvention, + standardEnglishQuoteConvention, + new QuotationMarkUpdateSettings(chapterStrategies: bestChapterStrategies) + ); + + var updater = new UpdateUsfmParserHandler(updateBlockHandlers: [quotationMarkDenormalizer]); + UsfmParser.Parse(normalizedUsfm, updater); + + string actualDenormalizedUsfm = updater.GetUsfm(); + + Assert.That(actualDenormalizedUsfm, Is.EqualTo(expectedDenormalizedUsfm).IgnoreLineEndings()); + } +} diff --git 
a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs new file mode 100644 index 00000000..c265d36f --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs @@ -0,0 +1,499 @@ +using NUnit.Framework; +using SIL.Machine.PunctuationAnalysis; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class QuotationMarkDenormalizationUsfmUpdateBlockHandlerTests +{ + private const string SimpleNormalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + 'You shall not eat of any tree of the garden'?"" + "; + + [Test] + public void SimpleEnglishQuoteDenormalization() + { + string normalizedUsfm = SimpleNormalizedUsfm; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" + ); + + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleBritishEnglishQuoteDenormalization() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, 'Has God really said, + ""You shall not eat of any tree of the garden""?' + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?’" + ); + + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "british_english", "british_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + + // no denormalization should be needed for this example + } + + [Test] + public void SimpleTypewriterEnglishQuoteDenormalization() + { + string normalizedUsfm = SimpleNormalizedUsfm; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, \"Has God really said, 'You shall not eat of any tree of the garden'?\"" + ); + + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "typewriter_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + + // some of the quotes shouldn't need to be denormalized + } + + [Test] + public void SimpleHybridTypewriterEnglishQuoteDenormalization() + { + string normalizedUsfm = SimpleNormalizedUsfm; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, 'You shall not eat of any tree of the garden'?”" + ); + + string observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "standard_english", + "hybrid_typewriter_english" + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + + // the single guillemets shouldn't need to be denormalized + // because Moses doesn't normalize them + } + + [Test] + public void SimpleFrenchQuoteDenormalization() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, ""Has God really said, + ‹You shall not eat of any tree of the garden›?"" + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, «Has God really said, ‹You shall not eat of any tree of the garden›?»" + ); + + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_french", "standard_french"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + + // the unusual quotation marks shouldn't need to be denormalized + } + + [Test] + public void SimpleTypewriterFrenchQuoteDenormalization() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, <?>> + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, <?>>" + ); + + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "typewriter_french", "typewriter_french"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + + // the 1st- and 2nd-level quotes are denormalized to identical marks + } + + [Test] + public void SimpleWesternEuropeanQuoteDenormalization() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + ""You shall not eat of any tree of the garden""?"" + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, «Has God really said, “You shall not eat of any tree of the garden”?»" + ); + + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "western_european", "western_european"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleTypewriterWesternEuropeanQuoteDenormalization() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, <> + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, <>" + ); + + string observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "typewriter_western_european", + "typewriter_western_european" + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleTypewriterWesternEuropeanVariantQuoteDenormalization() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + ?"" + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, \"Has God really said, ?\"" + ); + + string observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "typewriter_western_european_variant", + "typewriter_western_european_variant" + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleHybridTypewriterWesternEuropeanQuoteDenormalization() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, ""Has God really said, + ""You shall not eat of any tree of the garden""?"" + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, «Has God really said, \"You shall not eat of any tree of the garden\"?»" + ); + + string observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "hybrid_typewriter_western_european", + "hybrid_typewriter_western_european" + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleCentralEuropeanQuoteDenormalization() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + ""You shall not eat of any tree of the garden""?"" + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, „Has God really said, ‚You shall not eat of any tree of the garden‘?“" + ); + + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "central_european", "central_european"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleCentralEuropeanGuillemetsQuoteDenormalization() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + ›You shall not eat of any tree of the garden‹?"" + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, »Has God really said, ›You shall not eat of any tree of the garden‹?«" + ); + + string observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "central_european_guillemets", + "central_european_guillemets" + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleSwedishQuoteDenormalization() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + 'You shall not eat of any tree of the garden'?"" + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, ”Has God really said, ’You shall not eat of any tree of the garden’?”" + ); + + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_swedish", "standard_swedish"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleFinnishQuoteDenormalization() + { + string normalizedUsfm = SimpleNormalizedUsfm; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, »Has God really said, ’You shall not eat of any tree of the garden’?»" + ); + + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_finnish"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleEasternEuropeanQuoteDenormalization() + { + string normalizedUsfm = SimpleNormalizedUsfm; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, „Has God really said, ‚You shall not eat of any tree of the garden’?”" + ); + + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "eastern_european"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleRussianQuoteDenormalization() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + ""You shall not eat of any tree of the garden""?"" + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, «Has God really said, „You shall not eat of any tree of the garden“?»" + ); + + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_russian", "standard_russian"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void SimpleArabicQuoteDenormalization() + { + string normalizedUsfm = SimpleNormalizedUsfm; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, ”Has God really said, ’You shall not eat of any tree of the garden‘?“" + ); + + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_arabic"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void FallbackQuotationDenormalizationSameAsFull() + { + string normalizedUsfm = SimpleNormalizedUsfm; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" + ); + + string observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "standard_english", + "standard_english", + new QuotationMarkUpdateSettings(QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void FallbackQuotationDenormalizationIncorrectlyNested() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + ""You shall not eat of any tree of the garden""?"" + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, “You shall not eat of any tree of the garden”?”" + ); + + string observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "standard_english", + "standard_english", + new QuotationMarkUpdateSettings(QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void FallbackQuotationDenormalizationIncorrectlyNestedSecondCase() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, 'Has God really said, + ""You shall not eat of any tree of the garden""?' + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?’" + ); + + string observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "standard_english", + "standard_english", + new QuotationMarkUpdateSettings(QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + [Test] + public void FallbackQuotationDenormalizationUnclosedQuote() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + You shall not eat of any tree of the garden'?"" + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ); + + string observedUsfm = DenormalizeQuotationMarks( + normalizedUsfm, + "standard_english", + "standard_english", + new QuotationMarkUpdateSettings(QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + public string DenormalizeQuotationMarks( + string normalizedUsfm, + string sourceQuoteConventionName, + string targetQuoteConventionName, + QuotationMarkUpdateSettings? 
quotationDenormalizationSettings = null + ) + { + quotationDenormalizationSettings ??= new QuotationMarkUpdateSettings(); + QuotationMarkDenormalizationUsfmUpdateBlockHandler quotationDenormalizer = ( + CreateQuotationDenormalizationUsfmUpdateBlockHandler( + sourceQuoteConventionName, + targetQuoteConventionName, + quotationDenormalizationSettings + ) + ); + + var updater = new UpdateUsfmParserHandler(updateBlockHandlers: [quotationDenormalizer]); + UsfmParser.Parse(normalizedUsfm, updater); + + return updater.GetUsfm(); + } + + // Looks up both quote conventions by name and builds the denormalization handler from them. + // Default QuotationMarkUpdateSettings are used when the caller passes none. + public QuotationMarkDenormalizationUsfmUpdateBlockHandler CreateQuotationDenormalizationUsfmUpdateBlockHandler( + string sourceQuoteConventionName, + string targetQuoteConventionName, + QuotationMarkUpdateSettings? quotationDenormalizationSettings = null + ) + { + quotationDenormalizationSettings ??= new QuotationMarkUpdateSettings(); + QuoteConvention sourceQuoteConvention = GetQuoteConventionByName(sourceQuoteConventionName); + QuoteConvention targetQuoteConvention = GetQuoteConventionByName(targetQuoteConventionName); + + return new QuotationMarkDenormalizationUsfmUpdateBlockHandler( + sourceQuoteConvention, + targetQuoteConvention, + quotationDenormalizationSettings + ); + } + + // Compares two USFM strings line by line, ignoring whitespace at the ends of each line and + // at the end of the document. The line counts are asserted first: pairing the lines with Zip + // stops at the shorter sequence, so missing or extra trailing lines used to pass unnoticed. + public void AssertUsfmEqual(string observedUsfm, string expectedUsfm) + { + string[] observedLines = observedUsfm.TrimEnd().Split("\n"); + string[] expectedLines = expectedUsfm.TrimEnd().Split("\n"); + Assert.That(observedLines.Length, Is.EqualTo(expectedLines.Length), "USFM line counts differ"); + + for (int i = 0; i < expectedLines.Length; i++) + { + Assert.That(observedLines[i].Trim(), Is.EqualTo(expectedLines[i].Trim())); + } + } + + // Fetches a convention from QuoteConventions.Standard and fails the test if the lookup returns null. + public QuoteConvention GetQuoteConventionByName(string name) + { + QuoteConvention quoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(name); + Assert.IsNotNull(quoteConvention); + return quoteConvention; + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs new file mode 100644 index 00000000..2f4ba189 --- /dev/null +++ 
b/tests/SIL.Machine.Tests/Corpora/QuotationMarkUpdateFirstPassTests.cs @@ -0,0 +1,786 @@ +using NUnit.Framework; +using SIL.Machine.PunctuationAnalysis; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class QuotationMarkUpdateFirstPassTests +{ + [Test] + public void CheckWhetherFallbackModeWillWork() + { + var firstPassAnalyzer = new QuotationMarkUpdateFirstPass( + new QuoteConvention("", []), + new QuoteConvention("", []) + ); + + // Cases where we expect fallback mode to work + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_english"), + GetQuoteConventionByName("standard_english") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_french"), + GetQuoteConventionByName("british_english") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("typewriter_western_european"), + GetQuoteConventionByName("standard_russian") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("typewriter_western_european_variant"), + GetQuoteConventionByName("standard_arabic") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("central_european"), + GetQuoteConventionByName("british_typewriter_english") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_swedish"), + GetQuoteConventionByName("typewriter_french") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_finnish"), + GetQuoteConventionByName("british_inspired_western_european") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("eastern_european"), + GetQuoteConventionByName("central_european") + ) + ); + + // Cases where we expect fallback mode to fail 
+ Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_english"), + GetQuoteConventionByName("western_european") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("typewriter_french"), + GetQuoteConventionByName("western_european") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_french"), + GetQuoteConventionByName("french_variant") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("central_european"), + GetQuoteConventionByName("typewriter_western_european") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("eastern_european"), + GetQuoteConventionByName("standard_russian") + ) + ); + } + + [Test] + public void CheckWhetherFallbackModeWillWorkWithNormalizedConventions() + { + var firstPassAnalyzer = new QuotationMarkUpdateFirstPass( + new QuoteConvention("", []), + new QuoteConvention("", []) + ); + + // Cases where we expect fallback mode to work + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_english").Normalize(), + GetQuoteConventionByName("standard_english") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_french").Normalize(), + GetQuoteConventionByName("british_english") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("typewriter_western_european").Normalize(), + GetQuoteConventionByName("standard_russian") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("typewriter_western_european_variant").Normalize(), + GetQuoteConventionByName("standard_arabic") + ) + ); + Assert.IsTrue( + 
firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("central_european").Normalize(), + GetQuoteConventionByName("british_typewriter_english") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_swedish").Normalize(), + GetQuoteConventionByName("typewriter_french") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_finnish").Normalize(), + GetQuoteConventionByName("british_inspired_western_european") + ) + ); + Assert.IsTrue( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("eastern_european").Normalize(), + GetQuoteConventionByName("central_european") + ) + ); + + // Cases where we expect fallback mode to fail + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("western_european").Normalize(), + GetQuoteConventionByName("standard_english") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("french_variant").Normalize(), + GetQuoteConventionByName("hybrid_typewriter_english") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("british_inspired_western_european").Normalize(), + GetQuoteConventionByName("standard_russian") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("typewriter_english").Normalize(), + GetQuoteConventionByName("western_european") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("central_european_guillemets").Normalize(), + GetQuoteConventionByName("french_variant") + ) + ); + Assert.IsFalse( + firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_arabic").Normalize(), + GetQuoteConventionByName("hybrid_typewriter_english") + ) + ); + Assert.IsFalse( + 
firstPassAnalyzer.CheckWhetherFallbackModeWillWork( + GetQuoteConventionByName("standard_russian").Normalize(), + GetQuoteConventionByName("standard_french") + ) + ); + } + + [Test] + public void ChooseBestActionForChapter() + { + // Verse text with no issues + QuotationMarkUpdateStrategy actualAction = RunFirstPassOnChapter( + [ + "Now the serpent was more subtle than any animal " + + "of the field which Yahweh God had made. " + + "He said to the woman, “Has God really said, " + + "‘You shall not eat of any tree of the garden’?”" + ], + "standard_english", + "standard_english" + ); + QuotationMarkUpdateStrategy expectedAction = QuotationMarkUpdateStrategy.ApplyFull; + Assert.That(actualAction, Is.EqualTo(expectedAction)); + + // Verse text with unpaired opening quotation mark + actualAction = RunFirstPassOnChapter( + [ + "Now the serpent was more subtle than any animal " + + "of the field which Yahweh God had made. " + + "He said to the woman, “Has God really said, " + + "‘You shall not eat of any tree of the garden’?" + ], + "standard_english", + "standard_english" + ); + expectedAction = QuotationMarkUpdateStrategy.ApplyFallback; + Assert.That(actualAction, Is.EqualTo(expectedAction)); + + // Verse text with unpaired closing quotation mark + actualAction = RunFirstPassOnChapter( + [ + "Now the serpent was more subtle than any animal " + + "of the field which Yahweh God had made. " + + "He said to the woman, Has God really said, " + + "You shall not eat of any tree of the garden?”" + ], + "standard_english", + "standard_english" + ); + expectedAction = QuotationMarkUpdateStrategy.ApplyFallback; + Assert.That(actualAction, Is.EqualTo(expectedAction)); + + // Verse text with too deeply nested quotation marks + actualAction = RunFirstPassOnChapter( + [ + "“Now the serpent was more “subtle than any animal " + + "of the “field which “Yahweh God had made. " + + "He said to the woman, “Has God really said, " + + "“You shall not eat of any tree of the garden?" 
+ ], + "standard_english", + "standard_english" + ); + expectedAction = QuotationMarkUpdateStrategy.ApplyFallback; + Assert.That(actualAction, Is.EqualTo(expectedAction)); + + // Verse text with an ambiguous quotation mark (a straight quote wedged between two words) + actualAction = RunFirstPassOnChapter( + [ + "Now the serpent was more subtle than any animal " + + "of the field which Yahweh God had made. " + + "He said to the woman\"Has God really said, " + + "You shall not eat of any tree of the garden?" + ], + "typewriter_english", + "standard_english" + ); + expectedAction = QuotationMarkUpdateStrategy.Skip; + Assert.That(actualAction, Is.EqualTo(expectedAction)); + + // Verse text with too deeply nested ambiguous quotation marks + actualAction = RunFirstPassOnChapter( + [ + "\"Now the serpent was more \"subtle than any animal " + + "of the \"field which \"Yahweh God had made. " + + "He said to the woman, \"Has God really said, " + + "\"You shall not eat of any tree of the garden?" 
+ ], + "typewriter_english", + "standard_english" + ); + expectedAction = QuotationMarkUpdateStrategy.Skip; + Assert.That(actualAction, Is.EqualTo(expectedAction)); + } + + [Test] + public void ChooseBestActionBasedOnObservedIssues() + { + var firstPassAnalyzer = new QuotationMarkUpdateFirstPass( + new QuoteConvention("", []), + new QuoteConvention("", []) + ); + firstPassAnalyzer.WillFallbackModeWork = false; + + // Test with no issue + QuotationMarkUpdateStrategy bestAction = firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues([]); + Assert.That(bestAction, Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFull)); + + // Test with one issue + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.UnpairedQuotationMark] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.AmbiguousQuotationMark] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues([QuotationMarkResolutionIssue.TooDeepNesting]), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + + // Test with multiple issues + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.TooDeepNesting, QuotationMarkResolutionIssue.AmbiguousQuotationMark,] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [ + QuotationMarkResolutionIssue.UnpairedQuotationMark, + QuotationMarkResolutionIssue.AmbiguousQuotationMark, + ] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.TooDeepNesting, QuotationMarkResolutionIssue.UnpairedQuotationMark,] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + } + + [Test] + public void 
ChooseBestActionBasedOnObservedIssuesWithBasicFallback() + { + var firstPassAnalyzer = new QuotationMarkUpdateFirstPass( + new QuoteConvention("", []), + new QuoteConvention("", []) + ); + firstPassAnalyzer.WillFallbackModeWork = true; + + // Test with no issues: the full update is still expected + QuotationMarkUpdateStrategy bestAction = firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues([]); + Assert.That(bestAction, Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFull)); + + // Test with one issue: an ambiguous mark is expected to give Skip, the other issues ApplyFallback + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.UnpairedQuotationMark] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFallback) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.AmbiguousQuotationMark] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues([QuotationMarkResolutionIssue.TooDeepNesting]), + Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFallback) + ); + + // Test with multiple issues: any combination containing an ambiguous mark is expected to give Skip + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [ + QuotationMarkResolutionIssue.AmbiguousQuotationMark, + QuotationMarkResolutionIssue.UnpairedQuotationMark, + ] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.AmbiguousQuotationMark, QuotationMarkResolutionIssue.TooDeepNesting,] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.Skip) + ); + Assert.That( + firstPassAnalyzer.ChooseBestStrategyBasedOnObservedIssues( + [QuotationMarkResolutionIssue.TooDeepNesting, QuotationMarkResolutionIssue.UnpairedQuotationMark,] + ), + Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFallback) + ); + + // The tests that follow this method exercise FindBestChapterStrategies() through the RunFirstPass helper + } + + [Test] + public void NoIssuesInUsfm() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the 
field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + "; + List<QuotationMarkUpdateStrategy> expectedActions = [QuotationMarkUpdateStrategy.ApplyFull]; + List<QuotationMarkUpdateStrategy> observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); + + Assert.That(observedActions, Is.EqualTo(expectedActions)); + } + + // The outer opening mark is never closed; ApplyFallback is expected for the chapter. + [Test] + public void UnpairedOpeningMark() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’? + "; + List<QuotationMarkUpdateStrategy> expectedActions = [QuotationMarkUpdateStrategy.ApplyFallback]; + List<QuotationMarkUpdateStrategy> observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); + + Assert.That(observedActions, Is.EqualTo(expectedActions)); + } + + // A closing mark appears with no opening mark before it; ApplyFallback is expected for the chapter. + [Test] + public void UnpairedClosingMark() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, Has God really said, + You shall not eat of any tree of the garden?” + "; + List<QuotationMarkUpdateStrategy> expectedActions = [QuotationMarkUpdateStrategy.ApplyFallback]; + List<QuotationMarkUpdateStrategy> observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); + + Assert.That(observedActions, Is.EqualTo(expectedActions)); + } + + // Six opening marks in a row with no closing marks (the too-deep-nesting scenario). + [Test] + public void TooDeepNesting() + { + string normalizedUsfm = + @"\c 1 + \v 1 “Now the serpent was more “subtle than any animal + of the “field which “Yahweh God had made. + He said to the woman, “Has God really said, + “You shall not eat of any tree of the garden? 
+ "; + List<QuotationMarkUpdateStrategy> expectedActions = [QuotationMarkUpdateStrategy.ApplyFallback]; + List<QuotationMarkUpdateStrategy> observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); + + Assert.That(observedActions, Is.EqualTo(expectedActions)); + } + + // A straight quote wedged between two words is the ambiguous case; Skip is expected for the chapter. + [Test] + public void AmbiguousQuotationMark() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman""Has God really said, + You shall not eat of any tree of the garden? + "; + List<QuotationMarkUpdateStrategy> expectedActions = [QuotationMarkUpdateStrategy.Skip]; + List<QuotationMarkUpdateStrategy> observedActions = RunFirstPass( + normalizedUsfm, + "typewriter_english", + "standard_english" + ); + + Assert.That(observedActions, Is.EqualTo(expectedActions)); + } + + // One strategy is expected per chapter; neither chapter has an issue, so both get ApplyFull. + [Test] + public void NoIssuesInMultipleChapters() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + \c 2 \v 1 He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + "; + List<QuotationMarkUpdateStrategy> expectedActions = + [ + QuotationMarkUpdateStrategy.ApplyFull, + QuotationMarkUpdateStrategy.ApplyFull + ]; + List<QuotationMarkUpdateStrategy> observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); + + Assert.That(observedActions, Is.EqualTo(expectedActions)); + } + + // Only chapter 2 contains an unpaired mark, so only chapter 2 is expected to fall back. + [Test] + public void UnpairedQuotationMarkInSecondChapter() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ \c 2 \v 1 He said to the woman, Has God really said, + You shall not eat of any tree of the garden?” + "; + List<QuotationMarkUpdateStrategy> expectedActions = + [ + QuotationMarkUpdateStrategy.ApplyFull, + QuotationMarkUpdateStrategy.ApplyFallback + ]; + List<QuotationMarkUpdateStrategy> observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); + + Assert.That(observedActions, Is.EqualTo(expectedActions)); + } + + // The stray closing mark is in chapter 1 only; chapter 2 is well-formed and is expected to get ApplyFull. + [Test] + public void UnpairedQuotationMarkInFirstChapter() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had” made. + \c 2 \v 1 He said to the woman, Has God really said, + “You shall not eat of any tree of the garden?” + "; + List<QuotationMarkUpdateStrategy> expectedActions = + [ + QuotationMarkUpdateStrategy.ApplyFallback, + QuotationMarkUpdateStrategy.ApplyFull + ]; + List<QuotationMarkUpdateStrategy> observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); + + Assert.That(observedActions, Is.EqualTo(expectedActions)); + } + + // Chapter 1 has no quotation marks; chapter 2 has a straight quote between two words and is expected to be skipped. + [Test] + public void AmbiguousQuotationMarkInSecondChapter() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + \c 2 \v 1 He said to the woman, Has God really said, + You shall not""eat of any tree of the garden?"" + "; + List<QuotationMarkUpdateStrategy> expectedActions = + [ + QuotationMarkUpdateStrategy.ApplyFull, + QuotationMarkUpdateStrategy.Skip + ]; + List<QuotationMarkUpdateStrategy> observedActions = RunFirstPass( + normalizedUsfm, + "typewriter_english", + "standard_english" + ); + + Assert.That(observedActions, Is.EqualTo(expectedActions)); + } + + // Mirror of the previous case: the ambiguous straight quote is in chapter 1 instead. + [Test] + public void AmbiguousQuotationMarkInFirstChapter() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field""which Yahweh God had made. 
+ \c 2 \v 1 He said to the woman, Has God really said, + ""You shall not eat of any tree of the garden?"" + "; + List<QuotationMarkUpdateStrategy> expectedActions = + [ + QuotationMarkUpdateStrategy.Skip, + QuotationMarkUpdateStrategy.ApplyFull + ]; + List<QuotationMarkUpdateStrategy> observedActions = RunFirstPass( + normalizedUsfm, + "typewriter_english", + "standard_english" + ); + + Assert.That(observedActions, Is.EqualTo(expectedActions)); + } + + // Each chapter contains its own stray closing mark, so both are expected to fall back. + [Test] + public void UnpairedQuotationMarkInBothChapters() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had” made. + \c 2 \v 1 He said to the woman, Has God really said, + You shall not eat of any tree of the garden?” + "; + List<QuotationMarkUpdateStrategy> expectedActions = + [ + QuotationMarkUpdateStrategy.ApplyFallback, + QuotationMarkUpdateStrategy.ApplyFallback + ]; + List<QuotationMarkUpdateStrategy> observedActions = RunFirstPass( + normalizedUsfm, + "standard_english", + "standard_english" + ); + + Assert.That(observedActions, Is.EqualTo(expectedActions)); + } + + // Each chapter contains a straight quote between two words, so both are expected to be skipped. + [Test] + public void AmbiguousQuotationMarkInBothChapters() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had""made. + \c 2 \v 1 He said to the woman, Has God really said, + You shall not eat of any""tree of the garden? + "; + List<QuotationMarkUpdateStrategy> expectedActions = + [ + QuotationMarkUpdateStrategy.Skip, + QuotationMarkUpdateStrategy.Skip + ]; + List<QuotationMarkUpdateStrategy> observedActions = RunFirstPass( + normalizedUsfm, + "typewriter_english", + "standard_english" + ); + + Assert.That(observedActions, Is.EqualTo(expectedActions)); + } + + // Different issues per chapter: a stray mark at the end of chapter 1, an ambiguous mark in chapter 2. + [Test] + public void UnpairedInFirstAmbiguousInSecond() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made."" + \c 2 \v 1 He said to the woman, Has God really said, + You shall not eat of any""tree of the garden? 
+ "; + List<QuotationMarkUpdateStrategy> expectedActions = + [ + QuotationMarkUpdateStrategy.ApplyFallback, + QuotationMarkUpdateStrategy.Skip + ]; + List<QuotationMarkUpdateStrategy> observedActions = RunFirstPass( + normalizedUsfm, + "typewriter_english", + "standard_english" + ); + + Assert.That(observedActions, Is.EqualTo(expectedActions)); + } + + // Different issues per chapter: an ambiguous mark in chapter 1, a stray mark at the end of chapter 2. + [Test] + public void AmbiguousInFirstUnpairedInSecond() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God""had made. + \c 2 \v 1 He said to the woman, Has God really said, + You shall not eat of any tree of the garden ? "" + "; + List<QuotationMarkUpdateStrategy> expectedActions = + [ + QuotationMarkUpdateStrategy.Skip, + QuotationMarkUpdateStrategy.ApplyFallback + ]; + List<QuotationMarkUpdateStrategy> observedActions = RunFirstPass( + normalizedUsfm, + "typewriter_english", + "standard_english" + ); + + Assert.That(observedActions, Is.EqualTo(expectedActions)); + } + + // Parses the USFM with a QuotationMarkUpdateFirstPass built from the two named standard conventions + // and returns whatever FindBestChapterStrategies() reports. Fails the test if either name is unknown. + public List<QuotationMarkUpdateStrategy> RunFirstPass( + string normalizedUsfm, + string sourceQuoteConventionName, + string targetQuoteConventionName + ) + { + QuoteConvention sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + sourceQuoteConventionName + ); + Assert.IsNotNull(sourceQuoteConvention); + + QuoteConvention targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + targetQuoteConventionName + ); + Assert.IsNotNull(targetQuoteConvention); + + var firstPassAnalyzer = new QuotationMarkUpdateFirstPass(sourceQuoteConvention, targetQuoteConvention); + UsfmParser.Parse(normalizedUsfm, firstPassAnalyzer); + + return firstPassAnalyzer.FindBestChapterStrategies(); + } + + // Runs the first pass on a single chapter built from the given verse texts and returns the chosen strategy. + public QuotationMarkUpdateStrategy RunFirstPassOnChapter( + List<string> verseTexts, + string sourceQuoteConventionName, + string targetQuoteConventionName + ) + { + QuoteConvention sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + sourceQuoteConventionName + ); + Assert.IsNotNull(sourceQuoteConvention); + + QuoteConvention targetQuoteConvention = 
QuoteConventions.Standard.GetQuoteConventionByName( + targetQuoteConventionName + ); + Assert.IsNotNull(targetQuoteConvention); + + var firstPassAnalyzer = new QuotationMarkUpdateFirstPass(sourceQuoteConvention, targetQuoteConvention); + + var chapter = new Chapter( + verseTexts.Select(verseText => new Verse([new TextSegment.Builder().SetText(verseText).Build()])).ToList() + ); + + return firstPassAnalyzer.FindBestStrategyForChapter(chapter); + } + + public QuoteConvention GetQuoteConventionByName(string name) + { + QuoteConvention quoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(name); + Assert.IsNotNull(quoteConvention); + return quoteConvention; + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs new file mode 100644 index 00000000..af5a264e --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/QuoteConventionChangingUsfmBlockUpdateHandlerTests.cs @@ -0,0 +1,1018 @@ +using NUnit.Framework; +using SIL.Machine.PunctuationAnalysis; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class QuoteConventionChangingUsfmUpdateBlockHandlerTests +{ + [Test] + public void QuotesSpanningVerses() + { + string inputUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + \v 2 “You shall not eat of any tree of the garden”?» + "; + + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, \n" + + "\\v 2 ‘You shall not eat of any tree of the garden’?”" + ); + + string observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + /* Quotation marks inside a single \f footnote are expected to be converted. */ [Test] + public void SingleEmbed() + { + string inputUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + \f + \ft «This is a “footnote”» \f* + of the field which Yahweh God had made. + "; + + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal " + + "\\f + \\ft “This is a ‘footnote’” \\f* of the field which Yahweh God had made." + ); + + string observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + /* Quotation marks in two separate footnotes within one verse are each expected to be converted. */ [Test] + public void MultipleEmbeds() + { + string inputUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + \f + \ft «This is a “footnote”» \f* + of the field \f + \ft Second «footnote» here \f* which Yahweh God had made. + "; + + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal " + + "\\f + \\ft “This is a ‘footnote’” \\f* of the field \\f + \\ft Second " + + "“footnote” here \\f* which Yahweh God had made." + ); + + string observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + /* A footnote sits inside an open verse-text quote. The expected output converts the footnote's own marks starting at the first level again («footnote» becomes “footnote”), while the surrounding verse quote keeps its own nesting. */ [Test] + public void QuotesInTextAndEmbed() + { + string inputUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really \f + \ft a + «footnote» in the «midst of “text”» \f* said, + “You shall not eat of any tree of the garden”?» + "; + + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really \\f + \\ft a “footnote” in the “midst of ‘text’” \\f* " + + "said, ‘You shall not eat of any tree of the garden’?”" + ); + + string observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + /* Footnote inside an open verse-text quote, with a \v 2 marker falling between the opening quote and the footnote; the expected nesting in the output is unaffected by the verse break. */ [Test] + public void QuotesInMultipleVersesAndEmbed() + { + string inputUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God + \v 2 really \f + \ft a + «footnote» in the «midst of “text”» \f* said, + “You shall not eat of any tree of the garden”?» + "; + + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God\n" + + "\\v 2 really \\f + \\ft a “footnote” in the “midst of ‘text’” \\f* " + + "said, ‘You shall not eat of any tree of the garden’?”" + ); + + string observedUsfm = ChangeQuotationMarks(inputUsfm, "western_european", "standard_english"); + AssertUsfmEqual(observedUsfm, expectedUsfm); + + // Fallback mode does not consider the nesting of quotation marks, + // but only determines opening/closing marks and maps based on that. + } + + /* british_english input with correct nesting, forced to ApplyFallback through defaultChapterStrategy; the expected output is ordinary standard_english nesting. */ [Test] + public void FallbackStrategySameAsFull() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ‘Has God really said, + “You shall not eat of any tree of the garden”?’ + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" + ); + + string observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "british_english", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterStrategy: QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + /* The input opens ‘ twice and closes ’ twice; under ApplyFallback every one of those marks is expected to map to the first-level “ or ”. */ [Test] + public void FallbackStrategyIncorrectlyNested() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ‘Has God really said, + ‘You shall not eat of any tree of the garden’?’ + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, “You shall not eat of any tree of the garden”?”" + ); + + string observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "british_english", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterStrategy: QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + /* The british_english input uses the marks “ ‘ ’ ’ in that order; under ApplyFallback the expected output marks are ‘ “ ” ” in that order. */ [Test] + public void FallbackStrategyIncorrectlyNestedSecondCase() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?’ + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?”" + ); + + string observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "british_english", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterStrategy: QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + /* The input has the marks ‘ ” ’ with no opening mark for the ”; under ApplyFallback the expected output marks are “ ’ ”. */ [Test] + public void FallbackStrategyUnclosedQuote() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ‘Has God really said, + You shall not eat of any tree of the garden”?’ + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ); + + string observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "british_english", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterStrategy: QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + /* Runs one input with no settings and then with defaultChapterStrategy set to ApplyFull, ApplyFallback and Skip. Per the expected strings, the stray ' is left as-is with no settings and under ApplyFull, becomes ’ under ApplyFallback, and Skip leaves every mark unchanged. */ [Test] + public void DefaultQuotationMarkUpdateStrategy() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + You shall not eat of any tree of the garden'?"" + "; + string expectedFullUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ); + + string expectedBasicUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ); + + string expectedSkippedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, \"Has God really said, You shall not eat of any tree of the garden\'?\"" + ); + + string observedUsfm = ChangeQuotationMarks(normalizedUsfm, "typewriter_english", "standard_english"); + AssertUsfmEqual(observedUsfm, expectedFullUsfm); + + observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterStrategy: QuotationMarkUpdateStrategy.ApplyFull) + ); + AssertUsfmEqual(observedUsfm, expectedFullUsfm); + + observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterStrategy: QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedBasicUsfm); + + observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterStrategy: QuotationMarkUpdateStrategy.Skip) + ); + AssertUsfmEqual(observedUsfm, expectedSkippedUsfm); + } + + /* One-chapter input where the strategy is supplied through a one-element chapterStrategies list instead of defaultChapterStrategy; checks ApplyFull, ApplyFallback and Skip in turn. */ [Test] + public void SingleChapterQuotationMarkUpdateStrategy() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + You shall not eat of any tree of the garden'?"" + "; + string expectedFullUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ); + + string expectedBasicUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ); + + string expectedSkippedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, \"Has God really said, You shall not eat of any tree of the garden\'?\"" + ); + + string observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings(chapterStrategies: [QuotationMarkUpdateStrategy.ApplyFull]) + ); + AssertUsfmEqual(observedUsfm, expectedFullUsfm); + + observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings(chapterStrategies: [QuotationMarkUpdateStrategy.ApplyFallback]) + ); + AssertUsfmEqual(observedUsfm, expectedBasicUsfm); + + observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings(chapterStrategies: [QuotationMarkUpdateStrategy.Skip]) + ); + AssertUsfmEqual(observedUsfm, expectedSkippedUsfm); + } + + /* Two chapters given the same strategy. Per the expected strings, ApplyFull for both leaves the lone straight double quote in chapter 1 and the ' in chapter 2 untouched, while ApplyFallback for both turns them into ” and ’. */ [Test] + public void MultipleChapterSameStrategy() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle"" than any animal + of the field which Yahweh God had made. 
+ \c 2 + \v 1 He said to the woman, ""Has God really said, + You shall not eat of any tree of the garden'?"" + "; + string expectedFullUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle\" than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ); + + string expectedFallbackUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ); + + string observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings( + chapterStrategies: [QuotationMarkUpdateStrategy.ApplyFull, QuotationMarkUpdateStrategy.ApplyFull] + ) + ); + AssertUsfmEqual(observedUsfm, expectedFullUsfm); + + observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings( + chapterStrategies: + [ + QuotationMarkUpdateStrategy.ApplyFallback, + QuotationMarkUpdateStrategy.ApplyFallback + ] + ) + ); + AssertUsfmEqual(observedUsfm, expectedFallbackUsfm); + } + + /* Per-chapter strategies may differ: checks the pairs ApplyFull/ApplyFallback, ApplyFallback/ApplyFull and ApplyFallback/Skip against the same two-chapter input. */ [Test] + public void MultipleChapterMultipleStrategies() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle"" than any animal + of the field which Yahweh God had made. 
+ \c 2 + \v 1 He said to the woman, ""Has God really said, + You shall not eat of any tree of the garden'?"" + "; + string expectedFullThenFallbackUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle\" than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ); + + string expectedFallbackThenFullUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ); + + string expectedFallbackThenSkipUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, \"Has God really said, You shall not eat of any tree of the garden\'?\"" + ); + + /* Strategy pair 1: ApplyFull for chapter 1, ApplyFallback for chapter 2. */ string observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings( + chapterStrategies: [QuotationMarkUpdateStrategy.ApplyFull, QuotationMarkUpdateStrategy.ApplyFallback] + ) + ); + AssertUsfmEqual(observedUsfm, expectedFullThenFallbackUsfm); + + /* Strategy pair 2: ApplyFallback for chapter 1, ApplyFull for chapter 2. */ observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings( + chapterStrategies: [QuotationMarkUpdateStrategy.ApplyFallback, QuotationMarkUpdateStrategy.ApplyFull] + ) + ); + AssertUsfmEqual(observedUsfm, expectedFallbackThenFullUsfm); + + /* Strategy pair 3: ApplyFallback for chapter 1, Skip for chapter 2. */ observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_english", + "standard_english", + new QuotationMarkUpdateSettings( + chapterStrategies: [QuotationMarkUpdateStrategy.ApplyFallback, QuotationMarkUpdateStrategy.Skip] + ) + ); + AssertUsfmEqual(observedUsfm, expectedFallbackThenSkipUsfm); + } + + /* typewriter_french source converted to standard_english under ApplyFallback. NOTE(review): the input string literal of this test looks truncated around the angle-bracket marks - confirm against the original file. */ [Test] + public void MultiCharacterQuotationMarksInSourceQuoteConvention() + { + 
string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, <?>> + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" + ); + + string observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "typewriter_french", + "standard_english", + new QuotationMarkUpdateSettings(defaultChapterStrategy: QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + /* standard_english source converted to typewriter_french under ApplyFallback. NOTE(review): the expected string literal of this test looks truncated around the angle-bracket marks - confirm against the original file. */ [Test] + public void MultiCharacterQuotationMarksInTargetQuoteConvention() + { + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + "; + string expectedUsfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, <?>>" + ); + + string observedUsfm = ChangeQuotationMarks( + normalizedUsfm, + "standard_english", + "typewriter_french", + new QuotationMarkUpdateSettings(defaultChapterStrategy: QuotationMarkUpdateStrategy.ApplyFallback) + ); + AssertUsfmEqual(observedUsfm, expectedUsfm); + } + + /* Swaps in a mock finder and passes a mock resolver, then checks that each was called once and that the text of the segments held by the finder's matches now contains ‘ and ” where the mock's strings had straight quotes. */ [Test] + public void ProcessScriptureElement() + { + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "british_english") + ); + var quotationMarkFinder = new MockQuotationMarkFinder(); + quoteConventionChanger.InternalQuotationMarkFinder = quotationMarkFinder; + + var updateElement = new UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.Text, + tokens: [new UsfmToken("test segment")] + ); + var mockQuotationMarkResolver = new MockQuotationMarkResolver(); + quoteConventionChanger.InternalProcessScriptureElement(updateElement, mockQuotationMarkResolver); + + Assert.That(quotationMarkFinder.NumTimesCalled, Is.EqualTo(1)); + Assert.That(mockQuotationMarkResolver.NumTimesCalled, Is.EqualTo(1)); + Assert.That(quotationMarkFinder.MatchesToReturn[0].TextSegment.Text, Is.EqualTo("this is a ‘test")); + Assert.That(quotationMarkFinder.MatchesToReturn[1].TextSegment.Text, Is.EqualTo("the test ends” here")); + } + + /* A single text token is expected to yield one segment with that text, NoMarker as the immediate preceding marker, an empty preceding-marker context, and no previous or next segment. */ [Test] + public void CreateTextSegmentsBasic() + { + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") + ); + + var updateElement = new UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.Text, + tokens: [new UsfmToken("test segment")] + ); + List textSegments = quoteConventionChanger.InternalCreateTextSegments(updateElement); + + Assert.That(textSegments, Has.Count.EqualTo(1)); + Assert.That(textSegments[0].Text, Is.EqualTo("test segment")); + Assert.That(textSegments[0].ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.NoMarker)); + 
Assert.That(textSegments[0].MarkersInPrecedingContext, Has.Count.EqualTo(0)); + Assert.IsNull(textSegments[0].PreviousSegment); + Assert.IsNull(textSegments[0].NextSegment); + } + + /* Verse and Paragraph tokens placed before the text are expected to be recorded as preceding context, with the last one (Paragraph) as the immediate preceding marker. */ [Test] + public void CreateTextSegmentsWithPrecedingMarkers() + { + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") + ); + + var updateElement = new UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.Text, + tokens: + [ + new UsfmToken(UsfmTokenType.Verse, null, null, null), + new UsfmToken(UsfmTokenType.Paragraph, null, null, null), + new UsfmToken("test segment"), + ] + ); + List textSegments = quoteConventionChanger.InternalCreateTextSegments(updateElement); + + Assert.That(textSegments, Has.Count.EqualTo(1)); + Assert.That(textSegments[0].Text, Is.EqualTo("test segment")); + Assert.That(textSegments[0].ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.Paragraph)); + Assert.That( + textSegments[0].MarkersInPrecedingContext.SequenceEqual([UsfmMarkerType.Verse, UsfmMarkerType.Paragraph,]) + ); + Assert.IsNull(textSegments[0].PreviousSegment); + Assert.IsNull(textSegments[0].NextSegment); + } + + /* Two text tokens are expected to yield two segments linked to each other; the second segment's preceding context is [Verse, Character], i.e. only the markers that follow the first text token, and the trailing Paragraph token produces no segment. */ [Test] + public void CreateTextSegmentsWithMultipleTextTokens() + { + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") + ); + + var updateElement = new UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.Text, + tokens: + [ + new UsfmToken(UsfmTokenType.Verse, null, null, null), + new UsfmToken(UsfmTokenType.Paragraph, null, null, null), + new UsfmToken("test segment1"), + new UsfmToken(UsfmTokenType.Verse, null, null, null), + new UsfmToken(UsfmTokenType.Character, null, null, null), + new UsfmToken("test segment2"), + new UsfmToken(UsfmTokenType.Paragraph, null, null, null), + ] + ); + List textSegments = 
quoteConventionChanger.InternalCreateTextSegments(updateElement); + + Assert.That(textSegments, Has.Count.EqualTo(2)); + Assert.That(textSegments[0].Text, Is.EqualTo("test segment1")); + Assert.That(textSegments[0].ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.Paragraph)); + Assert.That( + textSegments[0].MarkersInPrecedingContext.SequenceEqual([UsfmMarkerType.Verse, UsfmMarkerType.Paragraph,]) + ); + Assert.IsNull(textSegments[0].PreviousSegment); + Assert.That(textSegments[0].NextSegment, Is.EqualTo(textSegments[1])); + Assert.That(textSegments[1].Text, Is.EqualTo("test segment2")); + Assert.That(textSegments[1].ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.Character)); + Assert.That( + textSegments[1].MarkersInPrecedingContext.SequenceEqual([UsfmMarkerType.Verse, UsfmMarkerType.Character,]) + ); + Assert.That(textSegments[1].PreviousSegment, Is.EqualTo(textSegments[0])); + Assert.IsNull(textSegments[1].NextSegment); + } + + /* A bare text token is expected to become a segment with that text, NoMarker, an empty preceding-marker context, and UsfmToken set to the originating token. */ [Test] + public void CreateTextSegment() + { + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") + ); + + var usfmToken = new UsfmToken("test segment"); + TextSegment segment = quoteConventionChanger.InternalCreateTextSegment(usfmToken); + + Assert.IsNotNull(segment); + Assert.That(segment.Text, Is.EqualTo("test segment")); + Assert.That(segment.ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.NoMarker)); + Assert.That(segment.MarkersInPrecedingContext, Has.Count.EqualTo(0)); + Assert.That(segment.UsfmToken, Is.EqualTo(usfmToken)); + } + + /* Three standalone segments are expected to be linked into a chain: the first has no previous segment and the last has no next segment. */ [Test] + public void SetPreviousAndNextForSegments() + { + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") + ); + + List segments = + [ + new TextSegment.Builder().SetText("segment 1 text").Build(), + new TextSegment.Builder().SetText("segment 2 
text").Build(), + new TextSegment.Builder().SetText("segment 3 text").Build() + ]; + + quoteConventionChanger.InternalSetPreviousAndNextForSegments(segments); + + Assert.IsNull(segments[0].PreviousSegment); + Assert.That(segments[0].NextSegment, Is.EqualTo(segments[1])); + Assert.That(segments[1].PreviousSegment, Is.EqualTo(segments[0])); + Assert.That(segments[1].NextSegment, Is.EqualTo(segments[2])); + Assert.That(segments[2].PreviousSegment, Is.EqualTo(segments[1])); + Assert.IsNull(segments[2].NextSegment); + } + + /* Replaces marks in place in both directions (typewriter_french to standard_english, then the reverse) and checks that the segment text and each mark's StartIndex/EndIndex reflect the change in mark length. NOTE(review): the literals containing angle brackets in this test look truncated relative to the start/end indices used with them - confirm against the original file. */ [Test] + public void UpdateQuotationMarks() + { + QuoteConventionChangingUsfmUpdateBlockHandler multiCharToSingleCharQuoteConventionChanger = + CreateQuoteConventionChangingUsfmUpdateBlockHandler("typewriter_french", "standard_english"); + + TextSegment multiCharacterTextSegment = new TextSegment.Builder() + .SetText("this < >>") + .Build(); + + List multiCharacterQuotationMarks = + [ + new QuotationMarkMetadata( + quotationMark: "<<", + depth: 1, + direction: QuotationMarkDirection.Opening, + textSegment: multiCharacterTextSegment, + startIndex: 5, + endIndex: 7 + ), + new QuotationMarkMetadata( + quotationMark: "<", + depth: 2, + direction: QuotationMarkDirection.Opening, + textSegment: multiCharacterTextSegment, + startIndex: 10, + endIndex: 11 + ), + new QuotationMarkMetadata( + quotationMark: ">", + depth: 2, + direction: QuotationMarkDirection.Closing, + textSegment: multiCharacterTextSegment, + startIndex: 25, + endIndex: 26 + ), + new QuotationMarkMetadata( + quotationMark: ">>", + depth: 1, + direction: QuotationMarkDirection.Closing, + textSegment: multiCharacterTextSegment, + startIndex: 27, + endIndex: 29 + ) + ]; + + multiCharToSingleCharQuoteConventionChanger.UpdateQuotationMarks(multiCharacterQuotationMarks); + + Assert.That(multiCharacterTextSegment.Text, Is.EqualTo("this “is ‘a test segment’ ”")); + Assert.That(multiCharacterQuotationMarks[0].StartIndex, Is.EqualTo(5)); + Assert.That(multiCharacterQuotationMarks[0].EndIndex, 
Is.EqualTo(6)); + Assert.That(multiCharacterQuotationMarks[0].TextSegment, Is.EqualTo(multiCharacterTextSegment)); + Assert.That(multiCharacterQuotationMarks[1].StartIndex, Is.EqualTo(9)); + Assert.That(multiCharacterQuotationMarks[1].EndIndex, Is.EqualTo(10)); + Assert.That(multiCharacterQuotationMarks[1].TextSegment, Is.EqualTo(multiCharacterTextSegment)); + Assert.That(multiCharacterQuotationMarks[2].StartIndex, Is.EqualTo(24)); + Assert.That(multiCharacterQuotationMarks[2].EndIndex, Is.EqualTo(25)); + Assert.That(multiCharacterQuotationMarks[2].TextSegment, Is.EqualTo(multiCharacterTextSegment)); + Assert.That(multiCharacterQuotationMarks[3].StartIndex, Is.EqualTo(26)); + Assert.That(multiCharacterQuotationMarks[3].EndIndex, Is.EqualTo(27)); + Assert.That(multiCharacterQuotationMarks[3].TextSegment, Is.EqualTo(multiCharacterTextSegment)); + + /* Reverse direction: single-character marks are replaced by multi-character ones, so the expected indices grow instead of shrink. */ QuoteConventionChangingUsfmUpdateBlockHandler singleCharToMultiCharQuoteConventionChanger = + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "typewriter_french"); + + TextSegment singleCharacterTextSegment = new TextSegment.Builder() + .SetText("this “is ‘a test segment’ ”") + .Build(); + + List singleCharacterQuotationMarks = + [ + new QuotationMarkMetadata( + quotationMark: "“", + depth: 1, + direction: QuotationMarkDirection.Opening, + textSegment: singleCharacterTextSegment, + startIndex: 5, + endIndex: 6 + ), + new QuotationMarkMetadata( + quotationMark: "‘", + depth: 2, + direction: QuotationMarkDirection.Opening, + textSegment: singleCharacterTextSegment, + startIndex: 9, + endIndex: 10 + ), + new QuotationMarkMetadata( + quotationMark: "’", + depth: 2, + direction: QuotationMarkDirection.Closing, + textSegment: singleCharacterTextSegment, + startIndex: 24, + endIndex: 25 + ), + new QuotationMarkMetadata( + quotationMark: "”", + depth: 1, + direction: QuotationMarkDirection.Closing, + textSegment: singleCharacterTextSegment, + startIndex: 26, + endIndex: 27 + ) + ]; + + 
singleCharToMultiCharQuoteConventionChanger.UpdateQuotationMarks(singleCharacterQuotationMarks); + + Assert.That(singleCharacterTextSegment.Text, Is.EqualTo("this < >>")); + Assert.That(singleCharacterQuotationMarks[0].StartIndex, Is.EqualTo(5)); + Assert.That(singleCharacterQuotationMarks[0].EndIndex, Is.EqualTo(7)); + Assert.That(singleCharacterQuotationMarks[0].TextSegment, Is.EqualTo(singleCharacterTextSegment)); + Assert.That(singleCharacterQuotationMarks[1].StartIndex, Is.EqualTo(10)); + Assert.That(singleCharacterQuotationMarks[1].EndIndex, Is.EqualTo(11)); + Assert.That(singleCharacterQuotationMarks[1].TextSegment, Is.EqualTo(singleCharacterTextSegment)); + Assert.That(singleCharacterQuotationMarks[2].StartIndex, Is.EqualTo(25)); + Assert.That(singleCharacterQuotationMarks[2].EndIndex, Is.EqualTo(26)); + Assert.That(singleCharacterQuotationMarks[2].TextSegment, Is.EqualTo(singleCharacterTextSegment)); + Assert.That(singleCharacterQuotationMarks[3].StartIndex, Is.EqualTo(27)); + Assert.That(singleCharacterQuotationMarks[3].EndIndex, Is.EqualTo(29)); + Assert.That(singleCharacterQuotationMarks[3].TextSegment, Is.EqualTo(singleCharacterTextSegment)); + } + + /* The current chapter number is expected to start at 0 and then follow the chapter of the ScriptureRef in each update block passed in (1, then 15). */ [Test] + public void CheckForChapterChange() + { + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler("standard_english", "standard_english") + ); + + Assert.That(quoteConventionChanger.InternalCurrentChapterNumber, Is.EqualTo(0)); + + quoteConventionChanger.InternalCheckForChapterChange(new UsfmUpdateBlock([ScriptureRef.Parse("MAT 1:1")], [])); + + Assert.That(quoteConventionChanger.InternalCurrentChapterNumber, Is.EqualTo(1)); + + quoteConventionChanger.InternalCheckForChapterChange( + new UsfmUpdateBlock([ScriptureRef.Parse("ISA 15:22")], []) + ); + + Assert.That(quoteConventionChanger.InternalCurrentChapterNumber, Is.EqualTo(15)); + } + + /* Starting chapter N is expected to select the N-th entry of chapterStrategies, reset the pending text-segment builder (empty text, no Embed marker, Chapter as the immediate preceding marker) and leave the verse-text resolver with no issues. */ [Test] + public void StartNewChapter() + { + 
MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler( + "standard_english", + "standard_english", + new QuotationMarkUpdateSettings( + chapterStrategies: + [ + QuotationMarkUpdateStrategy.Skip, + QuotationMarkUpdateStrategy.ApplyFull, + QuotationMarkUpdateStrategy.ApplyFallback, + ] + ) + ) + ); + + quoteConventionChanger.InternalVerseTextQuotationMarkResolver = new MockQuotationMarkResolver(); + + /* Seed state that the chapter change is expected to discard: builder text, an Embed marker, and one resolver issue. */ quoteConventionChanger + .InternalNextScriptureTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed) + .SetText("this text should be erased"); + quoteConventionChanger.InternalVerseTextQuotationMarkResolver.InternalIssues.Add( + QuotationMarkResolutionIssue.IncompatibleQuotationMark + ); + + quoteConventionChanger.InternalStartNewChapter(1); + TextSegment segment = quoteConventionChanger.InternalNextScriptureTextSegmentBuilder.Build(); + Assert.That(quoteConventionChanger.InternalCurrentStrategy, Is.EqualTo(QuotationMarkUpdateStrategy.Skip)); + Assert.That(segment.ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.Chapter)); + Assert.That(segment.Text, Is.EqualTo("")); + Assert.That(!segment.MarkersInPrecedingContext.Contains(UsfmMarkerType.Embed)); + Assert.That(quoteConventionChanger.InternalVerseTextQuotationMarkResolver.InternalIssues, Has.Count.EqualTo(0)); + + quoteConventionChanger.InternalStartNewChapter(2); + Assert.That(quoteConventionChanger.InternalCurrentStrategy, Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFull)); + + quoteConventionChanger.InternalStartNewChapter(3); + Assert.That( + quoteConventionChanger.InternalCurrentStrategy, + Is.EqualTo(QuotationMarkUpdateStrategy.ApplyFallback) + ); + } + + /* Test driver: builds the handler for the named conventions, runs it over the USFM through UpdateUsfmParserHandler, and returns the rewritten USFM. A null settings argument is replaced by new QuotationMarkUpdateSettings(). */ private static string ChangeQuotationMarks( + string normalizedUsfm, + string sourceQuoteConventionName, + string targetQuoteConventionName, + QuotationMarkUpdateSettings? 
quotationMarkUpdateSettings = null + ) + { + quotationMarkUpdateSettings ??= new QuotationMarkUpdateSettings(); + MockQuoteConventionChangingUsfmUpdateBlockHandler quoteConventionChanger = ( + CreateQuoteConventionChangingUsfmUpdateBlockHandler( + sourceQuoteConventionName, + targetQuoteConventionName, + quotationMarkUpdateSettings + ) + ); + + var updater = new UpdateUsfmParserHandler(updateBlockHandlers: [quoteConventionChanger]); + UsfmParser.Parse(normalizedUsfm, updater); + + return updater.GetUsfm(); + } + + /* Resolves both convention names in QuoteConventions.Standard (asserting each is non-null) and wraps them, with the settings, in the mock handler subclass that exposes Internal* pass-throughs. */ private static MockQuoteConventionChangingUsfmUpdateBlockHandler CreateQuoteConventionChangingUsfmUpdateBlockHandler( + string sourceQuoteConventionName, + string targetQuoteConventionName, + QuotationMarkUpdateSettings? quotationMarkUpdateSettings = null + ) + { + quotationMarkUpdateSettings ??= new QuotationMarkUpdateSettings(); + QuoteConvention sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + sourceQuoteConventionName + ); + Assert.IsNotNull(sourceQuoteConvention); + + QuoteConvention targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + targetQuoteConventionName + ); + Assert.IsNotNull(targetQuoteConvention); + + return new MockQuoteConventionChangingUsfmUpdateBlockHandler( + sourceQuoteConvention, + targetQuoteConvention, + quotationMarkUpdateSettings + ); + } + + /* Compares the two strings line by line (split on "\n") after trimming each line. NOTE(review): Zip stops at the shorter sequence, so extra lines at the end of either string are never compared - confirm this leniency is intended. */ private static void AssertUsfmEqual(string observedUsfm, string expectedUsfm) + { + foreach ((string observedLine, string expectedLine) in observedUsfm.Split("\n").Zip(expectedUsfm.Split("\n"))) + Assert.That(observedLine.Trim(), Is.EqualTo(expectedLine.Trim())); + } + + /* Subclass of the handler under test that re-exposes its members under Internal* names so the tests can read and drive them. */ private class MockQuoteConventionChangingUsfmUpdateBlockHandler( + QuoteConvention sourceQuoteConvention, + QuoteConvention targetQuoteConvention, + QuotationMarkUpdateSettings settings + ) : QuoteConventionChangingUsfmUpdateBlockHandler(sourceQuoteConvention, targetQuoteConvention, settings) + { + public QuotationMarkFinder InternalQuotationMarkFinder + { + set => 
QuotationMarkFinder = value; + } + + public TextSegment.Builder InternalNextScriptureTextSegmentBuilder + { + get => NextScriptureTextSegmentBuilder; + } + /* The getter throws InvalidOperationException unless the current resolver is a MockQuotationMarkResolver; the setter accepts only that type. */ public MockQuotationMarkResolver InternalVerseTextQuotationMarkResolver + { + get => + VerseTextQuotationMarkResolver is MockQuotationMarkResolver mqmr + ? mqmr + : throw new InvalidOperationException( + "Unable to use implementations of IQuotationMarkResolver other than MockQuotationMarkResolver" + ); + set => VerseTextQuotationMarkResolver = value; + } + public int InternalCurrentChapterNumber + { + get => CurrentChapterNumber; + set => CurrentChapterNumber = value; + } + public QuotationMarkUpdateStrategy InternalCurrentStrategy + { + get => CurrentStrategy; + set => CurrentStrategy = value; + } + + /* The methods below are one-line pass-throughs to the base-class members of the same name without the Internal prefix. */ public void InternalProcessScriptureElement( + UsfmUpdateBlockElement element, + IQuotationMarkResolver quotationMarkResolver + ) + { + ProcessScriptureElement(element, quotationMarkResolver); + } + + public List InternalCreateTextSegments(UsfmUpdateBlockElement element) + { + return CreateTextSegments(element); + } + + public TextSegment InternalCreateTextSegment(UsfmToken usfmToken) + { + return CreateTextSegment(usfmToken); + } + + public List InternalSetPreviousAndNextForSegments(List textSegments) + { + return SetPreviousAndNextForSegments(textSegments); + } + + public void InternalStartNewChapter(int newChapterNum) + { + StartNewChapter(newChapterNum); + } + + public void InternalCheckForChapterChange(UsfmUpdateBlock block) + { + CheckForChapterChange(block); + } + } + + /* Finder stub: ignores the segments it is given, counts calls, and always returns the same two pre-built matches. */ private class MockQuotationMarkFinder : QuotationMarkFinder + { + public int NumTimesCalled; + public readonly List MatchesToReturn; + + public MockQuotationMarkFinder() + : base(new QuoteConventionSet([])) + { + NumTimesCalled = 0; + MatchesToReturn = + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("this is a \"test").Build(), 10, 11), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("the test ends\" 
here").Build(), 13, 14), + ]; + } + + public override List FindAllPotentialQuotationMarksInTextSegments( + IReadOnlyList textSegments + ) + { + NumTimesCalled++; + return MatchesToReturn; + } + } + + private class MockQuotationMarkResolver(IQuotationMarkResolutionSettings? settings = null) + : DepthBasedQuotationMarkResolver( + settings ?? new QuoteConventionDetectionResolutionSettings(new QuoteConventionSet([])) + ) + { + public int NumTimesCalled = 0; + + public HashSet InternalIssues => Issues; + + public override void Reset() + { + base.Reset(); + NumTimesCalled = 0; + } + + public override IEnumerable ResolveQuotationMarks( + IReadOnlyList quoteMatches + ) + { + NumTimesCalled++; + int currentDepth = 1; + QuotationMarkDirection currentDirection = QuotationMarkDirection.Opening; + foreach (QuotationMarkStringMatch quoteMatch in quoteMatches) + { + yield return quoteMatch.Resolve(currentDepth, currentDirection); + currentDepth++; + currentDirection = + currentDirection == QuotationMarkDirection.Opening + ? 
QuotationMarkDirection.Closing + : QuotationMarkDirection.Opening; + } + } + + public override HashSet GetIssues() + { + return new HashSet(); + } + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index 9430621b..cb1092a6 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -9,9 +9,9 @@ public class UpdateUsfmParserHandlerTests [Test] public void GetUsfm_Verse_CharStyle() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:1"), "First verse of the first chapter.") + new UpdateUsfmRow(ScrRef("MAT 1:1"), "First verse of the first chapter.") }; string target = UpdateUsfm(rows); @@ -34,12 +34,12 @@ public void GetUsfm_IdText() [Test] public void GetUsfm_StripAllText() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:1"), "Update 1"), - (ScrRef("MAT 1:3"), "Update 3") + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), + new UpdateUsfmRow(ScrRef("MAT 1:3"), "Update 3") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \r keep this reference @@ -60,7 +60,7 @@ public void GetUsfm_StripAllText() styleBehavior: UpdateUsfmMarkerBehavior.Preserve ); - var result = + string result = @"\id MAT \c 1 \r keep this reference @@ -103,11 +103,11 @@ public void GetUsfm_StripAllText() [Test] public void GetUsfm_StripParagraphs_PreserveParagraphStyles() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:0/1:rem"), "New remark"), - (ScrRef("MAT 1:0/3:ip"), "Another new remark"), - (ScrRef("MAT 1:1"), "Update 1"), + new UpdateUsfmRow(ScrRef("MAT 1:0/1:rem"), "New remark"), + new UpdateUsfmRow(ScrRef("MAT 1:0/3:ip"), "Another new remark"), + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), }; string usfm = @"\id MAT @@ -135,7 +135,7 @@ public void 
GetUsfm_StripParagraphs_PreserveParagraphStyles() AssertUsfmEquals(target, result); - var targetDiffParagraph = UpdateUsfm( + string targetDiffParagraph = UpdateUsfm( rows, usfm, textBehavior: UpdateUsfmTextBehavior.StripExisting, @@ -157,10 +157,10 @@ public void GetUsfm_StripParagraphs_PreserveParagraphStyles() [Test] public void GetUsfm_PreserveParagraphs() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:0/1:rem"), "Update remark"), - (ScrRef("MAT 1:1"), "Update 1"), + new UpdateUsfmRow(ScrRef("MAT 1:0/1:rem"), "Update remark"), + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), }; string usfm = @"\id MAT @@ -183,7 +183,7 @@ public void GetUsfm_PreserveParagraphs() AssertUsfmEquals(target, result); - var targetDiffParagraph = UpdateUsfm( + string targetDiffParagraph = UpdateUsfm( rows, usfm, textBehavior: UpdateUsfmTextBehavior.StripExisting, @@ -204,7 +204,7 @@ public void GetUsfm_PreserveParagraphs() [Test] public void GetUsfm_ParagraphInVerse() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1"), }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), }; string usfm = @"\id MAT - Test \c 1 @@ -248,12 +248,12 @@ public void GetUsfm_ParagraphInVerse() [Test] public void GetUsfm_PreferExisting() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:1"), "Update 1"), - (ScrRef("MAT 1:2"), "Update 2"), + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), + new UpdateUsfmRow(ScrRef("MAT 1:2"), "Update 2"), }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 Some text @@ -261,7 +261,7 @@ public void GetUsfm_PreferExisting() \v 3 Other text "; string target = UpdateUsfm(rows, usfm, textBehavior: UpdateUsfmTextBehavior.PreferExisting); - var result = + string result = @"\id MAT - Test \c 1 \v 1 Some text @@ -274,10 +274,10 @@ public void GetUsfm_PreferExisting() [Test] public void GetUsfm_PreferRows() { - var rows = new 
List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:6"), "Text 6"), - (ScrRef("MAT 1:7"), "Text 7"), + new UpdateUsfmRow(ScrRef("MAT 1:6"), "Text 6"), + new UpdateUsfmRow(ScrRef("MAT 1:7"), "Text 7"), }; string target = UpdateUsfm(rows, textBehavior: UpdateUsfmTextBehavior.PreferNew); Assert.That(target, Contains.Substring("\\id MAT - Test\r\n")); @@ -288,9 +288,9 @@ public void GetUsfm_PreferRows() [Test] public void GetUsfm_Verse_StripNote() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:1"), "First verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:1"), "First verse of the second chapter.") }; string target = UpdateUsfm(rows, embedBehavior: UpdateUsfmMarkerBehavior.Strip); @@ -300,14 +300,14 @@ public void GetUsfm_Verse_StripNote() [Test] public void GetUsfm_Verse_ReplaceWithNote() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "updated text") }; - var usfm = + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "updated text") }; + string usfm = @"\id MAT - Test \c 1 \v 1 Chapter \add one\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one. 
"; - var target = UpdateUsfm(rows, usfm); - var result = + string target = UpdateUsfm(rows, usfm); + string result = @"\id MAT - Test \c 1 \v 1 updated text \f + \fr 2:1: \ft This is a footnote.\f* @@ -318,9 +318,9 @@ public void GetUsfm_Verse_ReplaceWithNote() [Test] public void GetUsfm_Verse_RowVerseSegment() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:1a"), "First verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:1a"), "First verse of the second chapter.") }; string target = UpdateUsfm(rows); @@ -335,9 +335,9 @@ public void GetUsfm_Verse_RowVerseSegment() [Test] public void GetUsfm_Verse_UsfmVerseSegment() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:7"), "Seventh verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:7"), "Seventh verse of the second chapter.") }; string target = UpdateUsfm(rows); @@ -347,9 +347,9 @@ public void GetUsfm_Verse_UsfmVerseSegment() [Test] public void GetUsfm_Verse_MultipleParas() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:2"), "Second verse of the first chapter.") + new UpdateUsfmRow(ScrRef("MAT 1:2"), "Second verse of the first chapter.") }; string target = UpdateUsfm(rows); @@ -364,9 +364,9 @@ public void GetUsfm_Verse_MultipleParas() [Test] public void GetUsfm_Verse_Table() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:9"), "Ninth verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:9"), "Ninth verse of the second chapter.") }; string target = UpdateUsfm(rows); @@ -376,9 +376,9 @@ public void GetUsfm_Verse_Table() [Test] public void GetUsfm_Verse_RangeSingleRowMultipleVerses() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - ( + new UpdateUsfmRow( ScrRef("MAT 2:11", "MAT 2:12"), "Eleventh verse of the second chapter. Twelfth verse of the second chapter." 
) @@ -396,9 +396,9 @@ public void GetUsfm_Verse_RangeSingleRowMultipleVerses() [Test] public void GetUsfm_Verse_RangeSingleRowSingleVerse() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:11"), "Eleventh verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:11"), "Eleventh verse of the second chapter.") }; string target = UpdateUsfm(rows); @@ -408,10 +408,10 @@ public void GetUsfm_Verse_RangeSingleRowSingleVerse() [Test] public void GetUsfm_Verse_RangeMultipleRowsSingleVerse() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:11"), "Eleventh verse of the second chapter."), - (ScrRef("MAT 2:12"), "Twelfth verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:11"), "Eleventh verse of the second chapter."), + new UpdateUsfmRow(ScrRef("MAT 2:12"), "Twelfth verse of the second chapter.") }; string target = UpdateUsfm(rows); @@ -426,11 +426,11 @@ public void GetUsfm_Verse_RangeMultipleRowsSingleVerse() [Test] public void GetUsfm_MergeVerseSegments() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:2"), "Verse 2."), - (ScrRef("MAT 2:2a"), "Verse 2a."), - (ScrRef("MAT 2:2b"), "Verse 2b.") + new UpdateUsfmRow(ScrRef("MAT 2:2"), "Verse 2."), + new UpdateUsfmRow(ScrRef("MAT 2:2a"), "Verse 2a."), + new UpdateUsfmRow(ScrRef("MAT 2:2b"), "Verse 2b.") }; string target = UpdateUsfm(rows); @@ -440,10 +440,10 @@ public void GetUsfm_MergeVerseSegments() [Test] public void GetUsfm_Verse_OptBreak() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:2"), "Second verse of the second chapter."), - (ScrRef("MAT 2:3"), "Third verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:2"), "Second verse of the second chapter."), + new UpdateUsfmRow(ScrRef("MAT 2:3"), "Third verse of the second chapter.") }; string target = UpdateUsfm(rows, embedBehavior: UpdateUsfmMarkerBehavior.Strip); @@ -456,9 +456,9 
@@ public void GetUsfm_Verse_OptBreak() [Test] public void GetUsfm_Verse_Milestone() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:10"), "Tenth verse of the second chapter.") + new UpdateUsfmRow(ScrRef("MAT 2:10"), "Tenth verse of the second chapter.") }; string target = UpdateUsfm(rows); @@ -471,9 +471,9 @@ public void GetUsfm_Verse_Milestone() [Test] public void GetUsfm_Verse_Unmatched() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:3"), "Third verse of the first chapter.") + new UpdateUsfmRow(ScrRef("MAT 1:3"), "Third verse of the first chapter.") }; string target = UpdateUsfm(rows); @@ -483,7 +483,7 @@ public void GetUsfm_Verse_Unmatched() [Test] public void GetUsfm_NonVerse_CharStyle() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 2:0/3:s1"), "The second chapter.") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 2:0/3:s1"), "The second chapter.") }; string target = UpdateUsfm(rows); Assert.That(target, Contains.Substring("\\s1 The second chapter.\r\n")); @@ -492,7 +492,7 @@ public void GetUsfm_NonVerse_CharStyle() [Test] public void GetUsfm_NonVerse_Paragraph() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:0/8:s"), "The first chapter.") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:0/8:s"), "The first chapter.") }; string target = UpdateUsfm(rows); Assert.That(target, Contains.Substring("\\s The first chapter.\r\n")); @@ -501,13 +501,13 @@ public void GetUsfm_NonVerse_Paragraph() [Test] public void GetUsfm_NonVerse_Relaxed() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:0/s"), "The first chapter."), - (ScrRef("MAT 1:1"), "First verse of the first chapter."), - (ScrRef("MAT 2:0/tr/tc1"), "The first cell of the table."), - (ScrRef("MAT 2:0/tr/tc2"), "The second cell of the table."), - (ScrRef("MAT 2:0/tr/tc1"), "The third cell of the table.") + new UpdateUsfmRow(ScrRef("MAT 
1:0/s"), "The first chapter."), + new UpdateUsfmRow(ScrRef("MAT 1:1"), "First verse of the first chapter."), + new UpdateUsfmRow(ScrRef("MAT 2:0/tr/tc1"), "The first cell of the table."), + new UpdateUsfmRow(ScrRef("MAT 2:0/tr/tc2"), "The second cell of the table."), + new UpdateUsfmRow(ScrRef("MAT 2:0/tr/tc1"), "The third cell of the table.") }; string target = UpdateUsfm(rows); @@ -531,9 +531,9 @@ public void GetUsfm_NonVerse_Relaxed() [Test] public void GetUsfm_NonVerse_Sidebar() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:3/1:esb/1:ms"), "The first paragraph of the sidebar.") + new UpdateUsfmRow(ScrRef("MAT 2:3/1:esb/1:ms"), "The first paragraph of the sidebar.") }; string target = UpdateUsfm(rows); @@ -543,10 +543,10 @@ public void GetUsfm_NonVerse_Sidebar() [Test] public void GetUsfm_NonVerse_Table() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:0/1:tr/1:tc1"), "The first cell of the table."), - (ScrRef("MAT 2:0/2:tr/1:tc1"), "The third cell of the table.") + new UpdateUsfmRow(ScrRef("MAT 2:0/1:tr/1:tc1"), "The first cell of the table."), + new UpdateUsfmRow(ScrRef("MAT 2:0/2:tr/1:tc1"), "The third cell of the table.") }; string target = UpdateUsfm(rows); @@ -563,9 +563,9 @@ public void GetUsfm_NonVerse_Table() [Test] public void GetUsfm_NonVerse_OptBreak() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 2:3/1:esb/2:p"), "The second paragraph of the sidebar.") + new UpdateUsfmRow(ScrRef("MAT 2:3/1:esb/2:p"), "The second paragraph of the sidebar.") }; string target = UpdateUsfm(rows); @@ -575,10 +575,7 @@ public void GetUsfm_NonVerse_OptBreak() [Test] public void GetUsfm_NonVerse_Milestone() { - var rows = new List<(IReadOnlyList, string)> - { - (ScrRef("MAT 2:7a/1:s"), "A new section header.") - }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 2:7a/1:s"), "A new section header.") }; string target = UpdateUsfm(rows); 
Assert.That(target, Contains.Substring("\\s A new section header. \\ts-s\\*\r\n")); @@ -587,10 +584,7 @@ public void GetUsfm_NonVerse_Milestone() [Test] public void GetUsfm_NonVerse_SkipNote() { - var rows = new List<(IReadOnlyList, string)> - { - (ScrRef("MAT 1:0/3:ip"), "The introductory paragraph.") - }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:0/3:ip"), "The introductory paragraph.") }; string target = UpdateUsfm(rows, embedBehavior: UpdateUsfmMarkerBehavior.Strip); Assert.That(target, Contains.Substring("\\ip The introductory paragraph.\r\n")); @@ -599,10 +593,7 @@ public void GetUsfm_NonVerse_SkipNote() [Test] public void GetUsfm_NonVerse_ReplaceWithNote() { - var rows = new List<(IReadOnlyList, string)> - { - (ScrRef("MAT 1:0/3:ip"), "The introductory paragraph.") - }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:0/3:ip"), "The introductory paragraph.") }; string target = UpdateUsfm(rows); Assert.That( @@ -614,9 +605,9 @@ public void GetUsfm_NonVerse_ReplaceWithNote() [Test] public void GetUsfm_Verse_DoubleVaVp() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 3:1"), "Updating later in the book to start.") + new UpdateUsfmRow(ScrRef("MAT 3:1"), "Updating later in the book to start.") }; string target = UpdateUsfm(rows); @@ -630,7 +621,7 @@ public void GetUsfm_Verse_DoubleVaVp() [Test] public void GetUsfm_Verse_LastSegment() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Updating the last verse.") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Updating the last verse.") }; string usfm = @"\id MAT - Test \c 1 @@ -653,14 +644,14 @@ public void GetUsfm_Verse_LastSegment() [Test] public void GetUsfm_Verse_PretranslationsBeforeText() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("GEN 1:1"), "Pretranslations before the start"), - (ScrRef("GEN 1:2"), "Pretranslations before the start"), - (ScrRef("GEN 1:3"), 
"Pretranslations before the start"), - (ScrRef("GEN 1:4"), "Pretranslations before the start"), - (ScrRef("GEN 1:5"), "Pretranslations before the start"), - (ScrRef("MAT 1:0/3:ip"), "The introductory paragraph.") + new UpdateUsfmRow(ScrRef("GEN 1:1"), "Pretranslations before the start"), + new UpdateUsfmRow(ScrRef("GEN 1:2"), "Pretranslations before the start"), + new UpdateUsfmRow(ScrRef("GEN 1:3"), "Pretranslations before the start"), + new UpdateUsfmRow(ScrRef("GEN 1:4"), "Pretranslations before the start"), + new UpdateUsfmRow(ScrRef("GEN 1:5"), "Pretranslations before the start"), + new UpdateUsfmRow(ScrRef("MAT 1:0/3:ip"), "The introductory paragraph.") }; string target = UpdateUsfm(rows); @@ -673,13 +664,13 @@ public void GetUsfm_Verse_PretranslationsBeforeText() [Test] public void GetUsfm_StripParagraphs() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:0/2:p"), "Update Paragraph"), - (ScrRef("MAT 1:1"), "Update Verse 1") + new UpdateUsfmRow(ScrRef("MAT 1:0/2:p"), "Update Paragraph"), + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update Verse 1") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \p This is a paragraph before any verses @@ -692,7 +683,7 @@ public void GetUsfm_StripParagraphs() "; string target = UpdateUsfm(rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Preserve); - var resultP = + string resultP = @"\id MAT - Test \c 1 \p This is a paragraph before any verses @@ -706,7 +697,7 @@ public void GetUsfm_StripParagraphs() AssertUsfmEquals(target, resultP); target = UpdateUsfm(rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Strip); - var resultS = + string resultS = @"\id MAT - Test \c 1 \p This is a paragraph before any verses @@ -721,19 +712,19 @@ public void GetUsfm_StripParagraphs() [Test] public void GetUsfm_PreservationRawStrings() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:1"), @"Update all in one row \f \fr 1.1 \ft Some note \f*") + new 
UpdateUsfmRow(ScrRef("MAT 1:1"), @"Update all in one row \f \fr 1.1 \ft Some note \f*") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 \f \fr 1.1 \ft Some note \f*Hello World "; string target = UpdateUsfm(rows, usfm, embedBehavior: UpdateUsfmMarkerBehavior.Strip); - var result = + string result = @"\id MAT - Test \c 1 \v 1 Update all in one row \f \fr 1.1 \ft Some note \f* @@ -744,16 +735,16 @@ public void GetUsfm_PreservationRawStrings() [Test] public void GetUsfm_BeginningOfVerseEmbed() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), @"Updated text") }; + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), @"Updated text") }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 \f \fr 1.1 \ft Some note \f* Text after note "; string target = UpdateUsfm(rows, usfm, embedBehavior: UpdateUsfmMarkerBehavior.Strip); - var result = + string result = @"\id MAT - Test \c 1 \v 1 Updated text @@ -764,17 +755,14 @@ public void GetUsfm_BeginningOfVerseEmbed() [Test] public void CrossReferenceDontUpdate() { - var rows = new List<(IReadOnlyList, string)> - { - (ScrRef("MAT 1:1/1:x"), "Update the cross reference"), - }; - var usfm = + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1/1:x"), "Update the cross reference"), }; + string usfm = @"\id MAT - Test \c 1 \v 1 Cross reference verse \x - \xo 2:3-4 \xt Cool Book 3:24 \xta The annotation \x* and more content. "; - var target = UpdateUsfm(rows, usfm); - var result = + string target = UpdateUsfm(rows, usfm); + string result = @"\id MAT - Test \c 1 \v 1 Cross reference verse \x - \xo 2:3-4 \xt Cool Book 3:24 \xta The annotation \x* and more content. 
@@ -785,14 +773,14 @@ public void CrossReferenceDontUpdate() [Test] public void PreserveFig() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update"), }; - var usfm = + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update"), }; + string usfm = @"\id MAT - Test \c 1 \v 1 initial text \fig stuff\fig* more text and more. "; - var target = UpdateUsfm(rows, usfm); - var result = + string target = UpdateUsfm(rows, usfm); + string result = @"\id MAT - Test \c 1 \v 1 Update \fig stuff\fig* @@ -803,18 +791,18 @@ public void PreserveFig() [Test] public void NoteExplicitEndMarkers() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:1"), "Update text"), - (ScrRef("MAT 1:1/1:f"), "Update note"), + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update text"), + new UpdateUsfmRow(ScrRef("MAT 1:1/1:f"), "Update note"), }; - var usfm = + string usfm = @"\id MAT - Test \c 1 \v 1 initial text \f + \fr 2.4\fr* \fk The \+nd Lord\+nd*:\fk* \ft See \+nd Lord\+nd* in Word List.\ft*\f* and the end. 
"; - var target = UpdateUsfm(rows, usfm); - var result = + string target = UpdateUsfm(rows, usfm); + string result = @"\id MAT - Test \c 1 \v 1 Update text \f + \fr 2.4\fr* \fk The \+nd Lord\+nd*:\fk* \ft See \+nd Lord\+nd* in Word List.\ft*\f* @@ -822,7 +810,7 @@ public void NoteExplicitEndMarkers() AssertUsfmEquals(target, result); target = UpdateUsfm(rows, usfm, embedBehavior: UpdateUsfmMarkerBehavior.Strip); - var result2 = + string result2 = @"\id MAT - Test \c 1 \v 1 Update text @@ -833,8 +821,8 @@ public void NoteExplicitEndMarkers() [Test] public void UpdateBlock_Verse_PreserveParas() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; + string usfm = @"\id MAT - Test \c 1 \v 1 verse 1 \p inner verse paragraph @@ -863,8 +851,8 @@ public void UpdateBlock_Verse_PreserveParas() [Test] public void UpdateBlock_Verse_StripParas() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; + string usfm = @"\id MAT - Test \c 1 \v 1 verse 1 \p inner verse paragraph @@ -893,8 +881,8 @@ public void UpdateBlock_Verse_StripParas() [Test] public void UpdateBlock_Verse_Range() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; + string usfm = @"\id MAT - Test \c 1 \v 1-3 verse 1 through 3 @@ -921,8 +909,8 @@ public void UpdateBlock_Verse_Range() [Test] public void UpdateBlock_Footnote_PreserveEmbeds() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; + string usfm = @"\id MAT - Test \c 1 \v 1 verse\f \fr 1.1 \ft Some note \f* 1 @@ -951,8 +939,8 @@ public void 
UpdateBlock_Footnote_PreserveEmbeds() [Test] public void UpdateBlock_Footnote_StripEmbeds() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; + string usfm = @"\id MAT - Test \c 1 \v 1 verse\f \fr 1.1 \ft Some note \f* 1 @@ -981,11 +969,8 @@ public void UpdateBlock_Footnote_StripEmbeds() [Test] public void UpdateBlock_NonVerse() { - var rows = new List<(IReadOnlyList, string)> - { - (ScrRef("MAT 1:0/1:s"), "Updated section Header") - }; - var usfm = + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:0/1:s"), "Updated section Header") }; + string usfm = @"\id MAT - Test \s Section header \c 1 @@ -1008,8 +993,8 @@ public void UpdateBlock_NonVerse() [Test] public void UpdateBlock_Verse_PreserveStyles() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; + string usfm = @"\id MAT - Test \c 1 \v 1 verse \bd 1\bd* @@ -1040,8 +1025,8 @@ public void UpdateBlock_Verse_PreserveStyles() [Test] public void UpdateBlock_Verse_StripStyles() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; + string usfm = @"\id MAT - Test \c 1 \v 1 verse \bd 1\bd* @@ -1072,8 +1057,8 @@ public void UpdateBlock_Verse_StripStyles() [Test] public void UpdateBlock_Verse_SectionHeader() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; + string usfm = @"\id MAT - Test \c 1 \p @@ -1110,8 +1095,8 @@ public void UpdateBlock_Verse_SectionHeader() [Test] public void UpdateBlock_Verse_SectionHeaderInVerse() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + var rows 
= new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; + string usfm = @"\id MAT - Test \c 1 \p @@ -1143,8 +1128,8 @@ public void UpdateBlock_Verse_SectionHeaderInVerse() [Test] public void UpdateBlock_NonVerse_ParagraphEndOfVerse() { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; - var usfm = + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1") }; + string usfm = @"\id MAT - Test \c 1 \p @@ -1172,16 +1157,16 @@ public void UpdateBlock_NonVerse_ParagraphEndOfVerse() [Test] public void GetUsfm_HeaderReferenceParagraphs() { - var rows = new List<(IReadOnlyList, string)> + var rows = new List { - (ScrRef("MAT 1:1"), "new verse 1"), - (ScrRef("MAT 1:2"), "new verse 2"), - (ScrRef("MAT 1:3"), "new verse 3"), - (ScrRef("MAT 2:1"), "new verse 1"), - (ScrRef("MAT 2:2"), "new verse 2") + new UpdateUsfmRow(ScrRef("MAT 1:1"), "new verse 1"), + new UpdateUsfmRow(ScrRef("MAT 1:2"), "new verse 2"), + new UpdateUsfmRow(ScrRef("MAT 1:3"), "new verse 3"), + new UpdateUsfmRow(ScrRef("MAT 2:1"), "new verse 1"), + new UpdateUsfmRow(ScrRef("MAT 2:2"), "new verse 2") }; - var usfm = + string usfm = @"\id MAT \c 1 \s1 beginning-of-chapter header @@ -1202,7 +1187,7 @@ public void GetUsfm_HeaderReferenceParagraphs() "; string target = UpdateUsfm(rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Strip); - var resultP = + string resultP = @"\id MAT \c 1 \s1 beginning-of-chapter header @@ -1224,13 +1209,46 @@ public void GetUsfm_HeaderReferenceParagraphs() AssertUsfmEquals(target, resultP); } + [Test] + public void GetUsfm_PreferExisting_AddRemark() + { + var rows = new List + { + new UpdateUsfmRow(ScrRef("MAT 1:1"), "Update 1"), + new UpdateUsfmRow(ScrRef("MAT 1:2"), "Update 2"), + }; + string usfm = + @"\id MAT - Test +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +"; + string target = UpdateUsfm( + rows, + usfm, + textBehavior: UpdateUsfmTextBehavior.PreferExisting, + remarks: ["New remark"] + ); + string result = + 
@"\id MAT - Test +\rem New remark +\c 1 +\v 1 Some text +\v 2 Update 2 +\v 3 Other text +"; + + AssertUsfmEquals(target, result); + } + private static ScriptureRef[] ScrRef(params string[] refs) { return refs.Select(r => ScriptureRef.Parse(r)).ToArray(); } private static string UpdateUsfm( - IReadOnlyList<(IReadOnlyList, string)>? rows = null, + IReadOnlyList? rows = null, string? source = null, string? idText = null, UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferNew, @@ -1238,7 +1256,8 @@ private static string UpdateUsfm( UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve, UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, IEnumerable? preserveParagraphStyles = null, - IEnumerable? usfmUpdateBlockHandlers = null + IEnumerable? usfmUpdateBlockHandlers = null, + IEnumerable? remarks = null ) { if (source is null) @@ -1253,7 +1272,8 @@ private static string UpdateUsfm( embedBehavior, styleBehavior, preserveParagraphStyles, - usfmUpdateBlockHandlers + usfmUpdateBlockHandlers, + remarks ); } else @@ -1267,7 +1287,8 @@ private static string UpdateUsfm( embedBehavior, styleBehavior, preserveParagraphStyles, - usfmUpdateBlockHandlers + usfmUpdateBlockHandlers, + remarks ); UsfmParser.Parse(source, updater); return updater.GetUsfm(); @@ -1277,8 +1298,8 @@ private static string UpdateUsfm( private static void AssertUsfmEquals(string target, string truth) { Assert.That(target, Is.Not.Null); - var target_lines = target.Split(["\n"], StringSplitOptions.None); - var truth_lines = truth.Split(["\n"], StringSplitOptions.None); + string[] target_lines = target.Split(["\n"], StringSplitOptions.None); + string[] truth_lines = truth.Split(["\n"], StringSplitOptions.None); for (int i = 0; i < truth_lines.Length; i++) { Assert.That(target_lines[i].Trim(), Is.EqualTo(truth_lines[i].Trim()), message: $"Line {i}"); @@ -1291,7 +1312,7 @@ private static void AssertUpdateBlockEquals( params (UsfmUpdateBlockElementType, 
string, bool)[] expectedElements ) { - var parsedExtractedRefs = expectedRefs.Select(r => ScriptureRef.Parse(r)); + IEnumerable parsedExtractedRefs = expectedRefs.Select(r => ScriptureRef.Parse(r)); Assert.That(block.Refs.SequenceEqual(parsedExtractedRefs)); Assert.That(block.Elements.Count, Is.EqualTo(expectedElements.Length)); foreach ( diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs index 63fe388a..30d8ecf7 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs @@ -28,9 +28,10 @@ public void ParseParallelCorpusAsync() Assert.That(rows, Has.Count.GreaterThan(0)); // insert the source into the target as pretranslations to make sure that USFM generation works - IReadOnlyList<(IReadOnlyList, string)> pretranslations = rows.Select(r => - ((IReadOnlyList)r.SourceRefs.Select(s => (ScriptureRef)s).ToList(), r.SourceText) - ) + IReadOnlyList pretranslations = rows.Select(r => new UpdateUsfmRow( + (IReadOnlyList)r.SourceRefs.Select(s => (ScriptureRef)s).ToList(), + r.SourceText + )) .ToList(); ParatextProjectSettings targetSettings = new FileParatextProjectSettingsParser( @@ -96,20 +97,17 @@ async Task GetUsfmAsync(string projectPath) // Read text from pretranslations file using Stream pretranslationStream = File.OpenRead(PretranslationPath); - (IReadOnlyList, string)[] pretranslations = await JsonSerializer + UpdateUsfmRow[] pretranslations = await JsonSerializer .DeserializeAsyncEnumerable( pretranslationStream, new JsonSerializerOptions { PropertyNamingPolicy = JsonNamingPolicy.CamelCase } ) - .Select(p => - ( - (IReadOnlyList)( - p?.Refs.Select(r => ScriptureRef.Parse(r, settings.Versification).ToRelaxed()).ToArray() - ?? [] - ), - p?.Translation ?? "" - ) - ) + .Select(p => new UpdateUsfmRow( + (IReadOnlyList)( + p?.Refs.Select(r => ScriptureRef.Parse(r, settings.Versification).ToRelaxed()).ToArray() ?? [] + ), + p?.Translation ?? 
"" + )) .ToArrayAsync(); List bookIds = []; ParatextProjectTextUpdaterBase updater; diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/ChapterTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/ChapterTests.cs new file mode 100644 index 00000000..d06e55f4 --- /dev/null +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/ChapterTests.cs @@ -0,0 +1,33 @@ +using NUnit.Framework; + +namespace SIL.Machine.PunctuationAnalysis; + +[TestFixture] +public class ChapterTests +{ + [Test] + public void InitializeVerse() + { + List textSegments1 = + [ + new TextSegment.Builder().SetText("Segment 1").Build(), + new TextSegment.Builder().SetText("Segment 2").Build(), + new TextSegment.Builder().SetText("Segment 3").Build(), + ]; + var verse1 = new Verse(textSegments1); + + List textSegments2 = + [ + new TextSegment.Builder().SetText("Segment 4").Build(), + new TextSegment.Builder().SetText("Segment 5").Build(), + new TextSegment.Builder().SetText("Segment 6").Build(), + ]; + var verse2 = new Verse(textSegments2); + + var chapter = new Chapter([verse1, verse2]); + + Assert.That(chapter.Verses, Has.Count.EqualTo(2)); + Assert.That(chapter.Verses[0].TextSegments, Is.EqualTo(textSegments1)); + Assert.That(chapter.Verses[1].TextSegments, Is.EqualTo(textSegments2)); + } +} diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs new file mode 100644 index 00000000..3576ed95 --- /dev/null +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/DepthBasedQuotationMarkResolverTests.cs @@ -0,0 +1,3461 @@ +using NUnit.Framework; + +namespace SIL.Machine.PunctuationAnalysis; + +[TestFixture] +public class DepthBasedQuotationMarkResolverTests +{ + [Test] + public void CurrentDepthQuotationMarkResolverState() + { + var state = new QuotationMarkResolverState(); + Assert.That(state.CurrentDepth, Is.EqualTo(0)); + + state.AddOpeningQuotationMark( + new 
QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.That(state.CurrentDepth, Is.EqualTo(1)); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.That(state.CurrentDepth, Is.EqualTo(2)); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ); + Assert.That(state.CurrentDepth, Is.EqualTo(1)); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.That(state.CurrentDepth, Is.EqualTo(0)); + } + + [Test] + public void HasOpenQuotationMark() + { + var state = new QuotationMarkResolverState(); + Assert.IsFalse(state.HasOpenQuotationMark); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsTrue(state.HasOpenQuotationMark); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsTrue(state.HasOpenQuotationMark); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ); + Assert.IsTrue(state.HasOpenQuotationMark); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.IsFalse(state.HasOpenQuotationMark); + } + + [Test] + public void AreMoreThanNQuotesOpen() + { + var state = new QuotationMarkResolverState(); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(1)); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(2)); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(1)); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(2)); + + 
state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsTrue(state.AreMoreThanNQuotesOpen(1)); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(2)); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(1)); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(2)); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(1)); + Assert.IsFalse(state.AreMoreThanNQuotesOpen(2)); + } + + [Test] + public void GetOpeningQuotationMarkAtDepth() + { + var state = new QuotationMarkResolverState(); + Assert.Throws(() => state.GetOpeningQuotationMarkAtDepth(1)); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.That(state.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\u201c")); + Assert.Throws(() => state.GetOpeningQuotationMarkAtDepth(2)); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.That(state.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\u201c")); + Assert.That(state.GetOpeningQuotationMarkAtDepth(2), Is.EqualTo("\u2018")); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ); + Assert.That(state.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\u201c")); + Assert.Throws(() => state.GetOpeningQuotationMarkAtDepth(2)); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.Throws(() => state.GetOpeningQuotationMarkAtDepth(1)); + } + + [Test] + public void GetDeepestOpeningMark() + { + var state = new QuotationMarkResolverState(); + 
Assert.Throws(() => state.GetDeepestOpeningQuotationMark()); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.That(state.GetDeepestOpeningQuotationMark(), Is.EqualTo("\u201c")); + + state.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.That(state.GetDeepestOpeningQuotationMark(), Is.EqualTo("\u2018")); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ); + Assert.That(state.GetDeepestOpeningQuotationMark(), Is.EqualTo("\u201c")); + + state.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.Throws(() => state.GetDeepestOpeningQuotationMark()); + } + + [Test] + public void GetCurrentDepthQuotationContinuerState() + { + var resolverState = new QuotationMarkResolverState(); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + + var continuerState = new TestQuoteContinuerState(); + Assert.That(continuerState.CurrentDepth, Is.EqualTo(0)); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.That(continuerState.CurrentDepth, Is.EqualTo(1)); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.That(continuerState.CurrentDepth, 
Is.EqualTo(2)); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.That(continuerState.CurrentDepth, Is.EqualTo(0)); + } + + [Test] + public void HasContinuerBeenObserved() + { + var resolverState = new QuotationMarkResolverState(); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + + var continuerState = new TestQuoteContinuerState(); + Assert.IsFalse(continuerState.ContinuerHasBeenObserved()); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.IsTrue(continuerState.ContinuerHasBeenObserved()); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.IsTrue(continuerState.ContinuerHasBeenObserved()); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.IsFalse(continuerState.ContinuerHasBeenObserved()); + } + + [Test] + public void GetContinuerStyle() + { + var resolverState = new QuotationMarkResolverState(); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + 
); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + + var continuerState = new TestQuoteContinuerState(); + Assert.That(continuerState.InternalContinuerStyle, Is.EqualTo(QuoteContinuerStyle.Undetermined)); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.That(continuerState.InternalContinuerStyle, Is.EqualTo(QuoteContinuerStyle.English)); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.Spanish + ); + Assert.That(continuerState.InternalContinuerStyle, Is.EqualTo(QuoteContinuerStyle.Spanish)); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.That(continuerState.InternalContinuerStyle, Is.EqualTo(QuoteContinuerStyle.English)); + } + + [Test] + public void AddQuotationContinuer() + { + var resolverState = new QuotationMarkResolverState(); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + + var continuerState = new TestQuoteContinuerState(); + + QuotationMarkMetadata result1 = continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.That( + result1, + Is.EqualTo( + new QuotationMarkMetadata( + "\u201c", + 1, 
+ QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("\u201c").Build(), + 0, + 1 + ) + ) + ); + + QuotationMarkMetadata result2 = continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.Spanish + ); + Assert.That( + result2, + Is.EqualTo( + new QuotationMarkMetadata( + "\u2018", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("\u2018").Build(), + 0, + 1 + ) + ) + ); + Assert.That(continuerState.InternalContinuerStyle, Is.EqualTo(QuoteContinuerStyle.Spanish)); + + QuotationMarkMetadata result3 = continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1), + resolverState, + QuoteContinuerStyle.English + ); + Assert.That( + result3, + Is.EqualTo( + new QuotationMarkMetadata( + "\u201c", + 3, + QuotationMarkDirection.Opening, + new TextSegment.Builder().SetText("\u201c").Build(), + 0, + 1 + ) + ) + ); + } + + [Test] + public void IsEnglishQuotationContinuer() + { + QuoteConvention standardEnglish = QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + Assert.IsNotNull(standardEnglish); + + var settings = new QuoteConventionDetectionResolutionSettings(new QuoteConventionSet([standardEnglish])); + var resolverState = new QuotationMarkResolverState(); + var continuerState = new TestQuoteContinuerState(); + var categorizer = new QuotationMarkCategorizer(settings, resolverState, continuerState); + + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + + // Should always be false if the continuer style is Spanish + continuerState.InternalContinuerStyle = QuoteContinuerStyle.English; + Assert.IsTrue( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201ctest") + 
.AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + continuerState.InternalContinuerStyle = QuoteContinuerStyle.Spanish; + Assert.IsFalse( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + continuerState.InternalContinuerStyle = QuoteContinuerStyle.English; + + // Should be false if there's no preceding paragraph marker (and the settings say to rely on markers) + Assert.IsFalse( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201ctest").Build(), 0, 1), + null, + null + ) + ); + + Assert.IsTrue( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + var categorizerForDenorm = new QuotationMarkCategorizer( + new QuotationMarkUpdateResolutionSettings(standardEnglish), + resolverState, + continuerState + ); + Assert.IsTrue( + categorizerForDenorm.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201ctest").Build(), 0, 1), + null, + null + ) + ); + + // Should be false if there are no open quotation marks + var emptyState = new QuotationMarkResolverState(); + var emptyCategorizer = new QuotationMarkCategorizer(settings, emptyState, continuerState); + Assert.IsFalse( + emptyCategorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + // Should be false if the starting index of the quotation mark is greater than 0 + Assert.IsFalse( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new 
TextSegment.Builder() + .SetText(" \u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + + // Should be false if the mark does not match the already opened mark + Assert.IsFalse( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u2018test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + // If there are multiple open quotes, the next quote continuer must follow immediately after the current one + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsFalse( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + Assert.IsTrue( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ) + ) + ); + Assert.IsTrue( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ) + ) + ); + + // When there are multiple open quotes, the continuer must match the deepest observed mark + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018test") + 
.AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + resolverState, + QuoteContinuerStyle.English + ); + + Assert.IsFalse( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + Assert.IsTrue( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsTrue( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + resolverState, + QuoteContinuerStyle.English + ); + + Assert.IsFalse( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018\u2018test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 2, + 3 + ), + null, + null + ) + ); + Assert.IsTrue( + categorizer.IsEnglishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201c\u2018\u201ctest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 2, + 3 + ), + null, + null + ) + ); + } + + [Test] + public void IsSpanishQuotationContinuer() + { + QuoteConvention westernEuropeanQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + "western_european" + ); + 
Assert.IsNotNull(westernEuropeanQuoteConvention); + + var settings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([westernEuropeanQuoteConvention]) + ); + var resolverState = new QuotationMarkResolverState(); + var continuerState = new TestQuoteContinuerState(); + var categorizer = new QuotationMarkCategorizer(settings, resolverState, continuerState); + + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00ab").Build(), 0, 1) + ); + + // Should always be false if the continuer style is Spanish + continuerState.InternalContinuerStyle = QuoteContinuerStyle.Spanish; + Assert.IsTrue( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + continuerState.InternalContinuerStyle = QuoteContinuerStyle.English; + Assert.IsFalse( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + continuerState.InternalContinuerStyle = QuoteContinuerStyle.Spanish; + + // Should be false if there's no preceding paragraph marker (and the settings say to rely on markers) + Assert.IsFalse( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bbtest").Build(), 0, 1), + null, + null + ) + ); + + Assert.IsTrue( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + var categorizerForDenorm = new QuotationMarkCategorizer( + new QuotationMarkUpdateResolutionSettings(westernEuropeanQuoteConvention), + resolverState, + continuerState + ); + Assert.IsTrue( + 
categorizerForDenorm.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bbtest").Build(), 0, 1), + null, + null + ) + ); + + // Should be false if there are no open quotation marks + var emptyState = new QuotationMarkResolverState(); + var emptyCategorizer = new QuotationMarkCategorizer(settings, emptyState, continuerState); + Assert.IsFalse( + emptyCategorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + // Should be false if the starting index of the quotation mark is greater than 0 + Assert.IsFalse( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText(" \u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + + // Should be false if the mark does not match the already opened mark + Assert.IsFalse( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u201dtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + + // If there are multiple open quotes, the next quote continuer must follow immediately after the current one + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsFalse( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + null + ) + ); + Assert.IsTrue( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201dtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + new 
QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201dtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ) + ) + ); + Assert.IsTrue( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + null, + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u00bbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ) + ) + ); + + // When there are multiple open quotes, the continuer must match the deepest observed mark + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201dtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ), + resolverState, + QuoteContinuerStyle.Spanish + ); + + Assert.IsFalse( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201cbtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + Assert.IsTrue( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201dtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + + resolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsTrue( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201d\u2019test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, + 2 + ), + null, + null + ) + ); + + continuerState.AddQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201d\u2019test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 1, 
+ 2 + ), + resolverState, + QuoteContinuerStyle.Spanish + ); + + Assert.IsFalse( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201d\u201dtest") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 2, + 3 + ), + null, + null + ) + ); + Assert.IsTrue( + categorizer.IsSpanishQuoteContinuer( + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("\u00bb\u201d\u2019test") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 2, + 3 + ), + null, + null + ) + ); + } + + [Test] + public void IsOpeningQuote() + { + QuoteConvention centralEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("central_european") + ); + Assert.IsNotNull(centralEuropeanQuoteConvention); + var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([centralEuropeanQuoteConvention]) + ); + var quotationMarkResolverState = new QuotationMarkResolverState(); + var quotationContinuerState = new QuoteContinuerState(); + var centralEuropeanQuotationMarkCategorizer = new QuotationMarkCategorizer( + centralEuropeanResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + QuoteConvention britishEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("british_english") + ); + Assert.IsNotNull(britishEnglishQuoteConvention); + var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([britishEnglishQuoteConvention]) + ); + var britishEnglishQuotationMarkCategorizer = new QuotationMarkCategorizer( + britishEnglishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + QuoteConvention standardSwedishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish") + ); + Assert.IsNotNull(standardSwedishQuoteConvention); + var standardSwedishResolverSettings = new 
QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardSwedishQuoteConvention]) + ); + var standardSwedishQuotationMarkCategorizer = new QuotationMarkCategorizer( + standardSwedishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var threeConventionsResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet( + [centralEuropeanQuoteConvention, britishEnglishQuoteConvention, standardSwedishQuoteConvention] + ) + ); + var threeConventionsQuotationMarkCategorizer = new QuotationMarkCategorizer( + threeConventionsResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + // It should only accept valid opening marks under the quote convention + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e").Build(), 1, 2) + ) + ); + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new 
TextSegment.Builder().SetText(" \u00ab").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \"").Build(), 1, 2) + ) + ); + + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a").Build(), 1, 2) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c").Build(), 1, 2) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \"").Build(), 1, 2) + ) + ); + + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new 
QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018").Build(), 1, 2) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d").Build(), 1, 2) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \"").Build(), 1, 2) + ) + ); + + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018").Build(), 1, 2) + ) + ); + Assert.IsTrue( + 
threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \"").Build(), 1, 2) + ) + ); + + // Leading whitespace is not necessary for unambiguous opening quotes + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("text\u201e").Build(), 4, 5) + ) + ); + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("text\u201a").Build(), 4, 5) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("text\u201c").Build(), 4, 5) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("text\u2018").Build(), 4, 5) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("text\u201e").Build(), 4, 5) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("text\u201a").Build(), 4, 5) + ) + ); + + // An ambiguous quotation mark (opening/closing) is recognized as opening if it has a 
quote introducer beforehand + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(",\u201d").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(":\u2019").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(",\u201c").Build(), 1, 2) + ) + ); + + // An ambiguous quotation mark (opening/closing) is recognized as opening if preceded by another opening mark + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsTrue( + 
standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2019").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201c").Build(), 1, 2) + ) + ); + + // An ambiguous quotation mark (opening/closing) is not recognized as opening if it has trailing whitespace or punctuation + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d.").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(",\u201d ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2019?").Build(), 1, 2) + ) + ); + } + + [Test] + public void IsClosingQuote() + { + QuoteConvention centralEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("central_european") + ); + Assert.IsNotNull(centralEuropeanQuoteConvention); + var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([centralEuropeanQuoteConvention]) + ); + var quotationMarkResolverState = new QuotationMarkResolverState(); + var quotationContinuerState = new QuoteContinuerState(); + var centralEuropeanQuotationMarkCategorizer = new 
QuotationMarkCategorizer( + centralEuropeanResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + QuoteConvention britishEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("british_english") + ); + Assert.IsNotNull(britishEnglishQuoteConvention); + var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([britishEnglishQuoteConvention]) + ); + var britishEnglishQuotationMarkCategorizer = new QuotationMarkCategorizer( + britishEnglishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + QuoteConvention standardSwedishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish") + ); + Assert.IsNotNull(standardSwedishQuoteConvention); + var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardSwedishQuoteConvention]) + ); + var standardSwedishQuotationMarkCategorizer = new QuotationMarkCategorizer( + standardSwedishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + QuoteConvention standardFrenchQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_french") + ); + Assert.IsNotNull(standardFrenchQuoteConvention); + var standardFrenchResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardFrenchQuoteConvention]) + ); + var standardFrenchQuotationMarkCategorizer = new QuotationMarkCategorizer( + standardFrenchResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var threeConventionsResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet( + [centralEuropeanQuoteConvention, britishEnglishQuoteConvention, standardSwedishQuoteConvention] + ) + ); + var threeConventionsQuotationMarkCategorizer = new QuotationMarkCategorizer( + threeConventionsResolverSettings, + 
quotationMarkResolverState, + quotationContinuerState + ); + + // It should only accept valid closing marks under the quote convention + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018 ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201e ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201a ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019 ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\" ").Build(), 0, 1) + ) + ); + + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018 ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + 
britishEnglishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019 ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\" ").Build(), 0, 1) + ) + ); + + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018 ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019 ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\" ").Build(), 0, 1) + ) + ); + + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c ").Build(), 0, 1) + ) + ); + 
Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018 ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019 ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\" ").Build(), 0, 1) + ) + ); + + // Trailing whitespace is not necessary for unambiguous closing quotes + Assert.IsTrue( + standardFrenchQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bbtext").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardFrenchQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u203atext").Build(), 0, 1) + ) + ); + + // An ambiguous quotation mark (opening/closing) is recognized as closing if + // followed by whitespace, punctuation or the end of the segment + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201dtext").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new 
TextSegment.Builder().SetText("\u2019text").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019?").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201ctext").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c?").Build(), 0, 1) + ) + ); + + // An ambiguous quotation mark (opening/closing) is not recognized as closing if + // it has leading whitespace + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\t\u201c?").Build(), 1, 2) + ) + ); + } + + [Test] + public void IsMalformedOpeningQuote() + { + QuoteConvention centralEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("central_european") + ); + Assert.IsNotNull(centralEuropeanQuoteConvention); + var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([centralEuropeanQuoteConvention]) + ); + var quotationMarkResolverState = new QuotationMarkResolverState(); + var quotationContinuerState = new
QuoteContinuerState(); + var centralEuropeanQuotationMarkCategorizer = new QuotationMarkCategorizer( + centralEuropeanResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + QuoteConvention britishEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("british_english") + ); + Assert.IsNotNull(britishEnglishQuoteConvention); + var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([britishEnglishQuoteConvention]) + ); + var britishEnglishQuotationMarkCategorizer = new QuotationMarkCategorizer( + britishEnglishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + QuoteConvention standardSwedishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish") + ); + Assert.IsNotNull(standardSwedishQuoteConvention); + var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardSwedishQuoteConvention]) + ); + var standardSwedishQuotationMarkCategorizer = new QuotationMarkCategorizer( + standardSwedishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var threeConventionsResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet( + [centralEuropeanQuoteConvention, britishEnglishQuoteConvention, standardSwedishQuoteConvention] + ) + ); + var threeConventionsQuotationMarkCategorizer = new QuotationMarkCategorizer( + threeConventionsResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + // It should only accept valid opening marks under the quote convention + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new 
QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \" ").Build(), 1, 2) + ) + ); + + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018 
").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \" ").Build(), 1, 2) + ) + ); + + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018 ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + 
standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \" ").Build(), 1, 2) + ) + ); + + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201e ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201a ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018 ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u00ab ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \" ").Build(), 1, 2) + ) + ); + + // Should return true if there is a leading quote introducer + Assert.IsFalse( 
+ standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(",\u201d ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019 ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(":\u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(",\u201c ").Build(), 1, 2) + ) + ); + + // Should return false unless the mark has leading and trailing whitespace + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d").Build(), 1, 2) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + + // Should return false if there is already an open quotation mark on the stack + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new 
TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019 ").Build(), 1, 2) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201c ").Build(), 1, 2) + ) + ); + } + + [Test] + public void IsMalformedClosingQuote() + { + QuoteConvention centralEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("central_european") + ); + Assert.IsNotNull(centralEuropeanQuoteConvention); + var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([centralEuropeanQuoteConvention]) + ); + var quotationMarkResolverState = new QuotationMarkResolverState(); + var quotationContinuerState = new QuoteContinuerState(); + var centralEuropeanQuotationMarkCategorizer = new QuotationMarkCategorizer( + centralEuropeanResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + QuoteConvention britishEnglishQuoteConvention = ( + 
QuoteConventions.Standard.GetQuoteConventionByName("british_english") + ); + Assert.IsNotNull(britishEnglishQuoteConvention); + var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([britishEnglishQuoteConvention]) + ); + var britishEnglishQuotationMarkCategorizer = new QuotationMarkCategorizer( + britishEnglishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + QuoteConvention standardSwedishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish") + ); + Assert.IsNotNull(standardSwedishQuoteConvention); + var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardSwedishQuoteConvention]) + ); + var standardSwedishQuotationMarkCategorizer = new QuotationMarkCategorizer( + standardSwedishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var threeConventionsResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet( + [centralEuropeanQuoteConvention, britishEnglishQuoteConvention, standardSwedishQuoteConvention] + ) + ); + var threeConventionsQuotationMarkCategorizer = new QuotationMarkCategorizer( + threeConventionsResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + // It should only accept valid closing marks under the quote convention + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201e").Build(), 0, 1) + ); + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsFalse( + 
centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201e").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201a").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + quotationMarkResolverState.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + 
); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + quotationMarkResolverState.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ); + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new 
TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + // Returns true if it's at the end of the segment + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + + // Returns true if it does not have trailing whitespace + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d-").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + 
new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201dtext").Build(), 0, 1) + ) + ); + + // Returns true if it has trailing and leading whitespace + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d ").Build(), 1, 2) + ) + ); + + // Requires there to be an open quotation mark on the stack + quotationMarkResolverState.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + + // Requires the quotation mark on the stack to be a valid pair with the + // observed quotation mark + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + 
britishEnglishQuotationMarkCategorizer.IsMalformedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + } + + [Test] + public void IsUnpairedClosingQuote() + { + QuoteConvention centralEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("central_european") + ); + Assert.IsNotNull(centralEuropeanQuoteConvention); + var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([centralEuropeanQuoteConvention]) + ); + var quotationMarkResolverState = new QuotationMarkResolverState(); + var quotationContinuerState = new QuoteContinuerState(); + var centralEuropeanQuotationMarkCategorizer = new QuotationMarkCategorizer( + centralEuropeanResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + QuoteConvention britishEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("british_english") + ); + Assert.IsNotNull(britishEnglishQuoteConvention); + var britishEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([britishEnglishQuoteConvention]) + ); + var britishEnglishQuotationMarkCategorizer = new QuotationMarkCategorizer( + britishEnglishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + QuoteConvention standardSwedishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish") + ); + Assert.IsNotNull(standardSwedishQuoteConvention); + var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardSwedishQuoteConvention]) + ); + var standardSwedishQuotationMarkCategorizer = new QuotationMarkCategorizer( + standardSwedishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + var threeConventionsResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet( + 
[centralEuropeanQuoteConvention, britishEnglishQuoteConvention, standardSwedishQuoteConvention] + ) + ); + var threeConventionsQuotationMarkCategorizer = new QuotationMarkCategorizer( + threeConventionsResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + // It should only accept valid closing marks under the quote convention + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsTrue( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201e").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201a").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new 
QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + 
Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u00bb").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ) + ); + + // There must not be an opening quotation mark on the stack + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + centralEuropeanQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsFalse( + 
britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + standardSwedishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsFalse( + threeConventionsQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ) + ); + + // There must not be leading whitespace + quotationMarkResolverState.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u201d").Build(), 1, 2) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new 
QuotationMarkStringMatch(new TextSegment.Builder().SetText("\t\u2019").Build(), 1, 2) + ) + ); + + // The quotation mark must be either at the end of the segment + // or have trailing whitespace + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d").Build(), 0, 1) + ) + ); + Assert.IsTrue( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d ").Build(), 0, 1) + ) + ); + Assert.IsFalse( + britishEnglishQuotationMarkCategorizer.IsUnpairedClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d?").Build(), 0, 1) + ) + ); + } + + [Test] + public void IsApostrophe() + { + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var quotationMarkResolverState = new QuotationMarkResolverState(); + var quotationContinuerState = new QuoteContinuerState(); + var standardEnglishQuotationMarkCategorizer = new QuotationMarkCategorizer( + standardEnglishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + QuoteConvention typewriterEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("typewriter_english") + ); + Assert.IsNotNull(typewriterEnglishQuoteConvention); + var typewriterEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + var typewriterEnglishQuotationMarkCategorizer = new QuotationMarkCategorizer( + typewriterEnglishResolverSettings, + quotationMarkResolverState, + quotationContinuerState + ); + + // The quotation 
mark must make for a plausible apostrophe + Assert.IsTrue( + typewriterEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a'b").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + typewriterEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u2019b").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + typewriterEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u2018b").Build(), 1, 2), + null + ) + ); + Assert.IsFalse( + typewriterEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u201cb").Build(), 1, 2), + null + ) + ); + Assert.IsFalse( + typewriterEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\"b").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a'b").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u2019b").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u2018b").Build(), 1, 2), + null + ) + ); + Assert.IsFalse( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u201cb").Build(), 1, 2), + null + ) + ); + Assert.IsFalse( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\"b").Build(), 1, 2), + null + ) + ); + + // Returns true if the mark has Latin letters on both sides + quotationMarkResolverState.AddOpeningQuotationMark( + new 
QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u2019Ƅ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("ǡ\u2019b").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("ᴀ\u2019B").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("𝼀\u2019Ꝙ").Build(), 1, 2), + null + ) + ); + + Assert.IsFalse( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u2019ℵ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + typewriterEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("a\u2019ℵ").Build(), 1, 2), + null + ) + ); + + // Recognizes s possessives (e.G. 
Moses') + quotationMarkResolverState.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2019").Build(), 0, 1) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("s\u2019 ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("Moses\u2019 ").Build(), 5, 6), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("s\u2019?").Build(), 1, 2), + null + ) + ); + Assert.IsFalse( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("s\u20195").Build(), 1, 2), + null + ) + ); + + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("s\u2019 ").Build(), 1, 2), + null + ) + ); + + quotationMarkResolverState.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ); + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018").Build(), 0, 1) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("s\u2019 ").Build(), 1, 2), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word\u2019").Build(), 4, 5) + ) + ); + Assert.IsFalse( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("s\u2019 ").Build(), 1, 2), + new QuotationMarkStringMatch(new 
TextSegment.Builder().SetText("word\u201d").Build(), 4, 5) + ) + ); + + // the straight quote should always be an apostrophe if it's not a valid quotation mark + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("5'ℵ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" ' ").Build(), 1, 2), + null + ) + ); + + // the straight quote should be an apostrophe if there's nothing on the quotation mark stack + quotationMarkResolverState.AddClosingQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("5'ℵ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" ' ").Build(), 1, 2), + null + ) + ); + + // any matching mark should be an apostrophe if it doesn't pair with the + // deepest opening quotation mark on the stack + // (opening/closing quotation marks will have been detected before calling this) + quotationMarkResolverState.AddOpeningQuotationMark( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("5'ℵ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" ' ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("5\u2018ℵ").Build(), 1, 2), + null + ) 
+ ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2018 ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("5\u2019ℵ").Build(), 1, 2), + null + ) + ); + Assert.IsTrue( + standardEnglishQuotationMarkCategorizer.IsApostrophe( + new QuotationMarkStringMatch(new TextSegment.Builder().SetText(" \u2019 ").Build(), 1, 2), + null + ) + ); + } + + [Test] + public void DepthBasedQuotationMarkResolverReset() + { + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201cThis is a quote").Build(), 0, 1)] + ) + .ToList(); + + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.UnpairedQuotationMark]) + ); + + standardEnglishQuotationMarkResolver.Reset(); + Assert.That(standardEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("This is a quote\u2019").Build(), + 15, + 16 + ) + ] + ) + .ToList(); + + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.UnpairedQuotationMark]) + ); + } + + [Test] + public void BasicQuotationMarkRecognition() + { + QuoteConvention 
standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + TextSegment textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote\u2019\u201d").Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 11, 12), + new QuotationMarkStringMatch(textSegment, 17, 18), + new QuotationMarkStringMatch(textSegment, 18, 19), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment, 11, 12), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, textSegment, 17, 18), + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment, 18, 19), + ] + ) + ); + Assert.That(standardEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void ResolutionOnlyOfPassedMatches() + { + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + TextSegment textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote\u2019\u201d").Build(); + Assert.That( + 
standardEnglishQuotationMarkResolver + .ResolveQuotationMarks([new QuotationMarkStringMatch(textSegment, 0, 1),]) + .SequenceEqual( + [new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment, 0, 1),] + ) + ); + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.UnpairedQuotationMark]) + ); + + textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote\u2019\u201d").Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks([new QuotationMarkStringMatch(textSegment, 17, 18),]) + .Count(), + Is.EqualTo(0) + ); + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.UnpairedQuotationMark]) + ); + } + + [Test] + public void ResolutionAcrossSegments() + { + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + TextSegment textSegment1 = new TextSegment.Builder().SetText("\u201cThis is a ").Build(); + TextSegment textSegment2 = new TextSegment.Builder().SetText("\u2018quote\u2019\u201d").Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment1, 0, 1), + new QuotationMarkStringMatch(textSegment2, 0, 1), + new QuotationMarkStringMatch(textSegment2, 6, 7), + new QuotationMarkStringMatch(textSegment2, 7, 8), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment1, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, 
textSegment2, 0, 1), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, textSegment2, 6, 7), + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment2, 7, 8), + ] + ) + ); + Assert.That(standardEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void ResolutionWithApostrophes() + { + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + TextSegment textSegment = ( + new TextSegment.Builder() + .SetText("\u201cThis\u2019 is a \u2018quote\u2019\u201d") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 5, 6), + new QuotationMarkStringMatch(textSegment, 12, 13), + new QuotationMarkStringMatch(textSegment, 18, 19), + new QuotationMarkStringMatch(textSegment, 19, 20), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment, 12, 13), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, textSegment, 18, 19), + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment, 19, 20), + ] + ) + ); + Assert.That(standardEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + + QuoteConvention typewriterEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("typewriter_english") + ); + 
Assert.IsNotNull(typewriterEnglishQuoteConvention); + var typewriterEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + var typewriterEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver( + typewriterEnglishResolverSettings + ); + + textSegment = new TextSegment.Builder() + .SetText("\"This' is a 'quote'\"") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(); + Assert.That( + typewriterEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 5, 6), + new QuotationMarkStringMatch(textSegment, 12, 13), + new QuotationMarkStringMatch(textSegment, 18, 19), + new QuotationMarkStringMatch(textSegment, 19, 20), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\"", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("'", 2, QuotationMarkDirection.Opening, textSegment, 12, 13), + new QuotationMarkMetadata("'", 2, QuotationMarkDirection.Closing, textSegment, 18, 19), + new QuotationMarkMetadata("\"", 1, QuotationMarkDirection.Closing, textSegment, 19, 20), + ] + ) + ); + /* Check the typewriter-English resolver that was just exercised; the standard-English resolver was already verified earlier in this test. */ Assert.That(typewriterEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void EnglishQuoteContinuers() + { + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + TextSegment textSegment1 = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote").Build(); + TextSegment textSegment2 = ( + new TextSegment.Builder() + 
.SetText("\u201c\u2018This is the rest\u2019 of it\u201d") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment1, 0, 1), + new QuotationMarkStringMatch(textSegment1, 11, 12), + new QuotationMarkStringMatch(textSegment2, 0, 1), + new QuotationMarkStringMatch(textSegment2, 1, 2), + new QuotationMarkStringMatch(textSegment2, 18, 19), + new QuotationMarkStringMatch(textSegment2, 25, 26), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment1, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment1, 11, 12), + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment2, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment2, 1, 2), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, textSegment2, 18, 19), + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment2, 25, 26), + ] + ) + ); + Assert.That(standardEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void SpanishQuoteContinuers() + { + QuoteConvention westernEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("western_european") + ); + Assert.IsNotNull(westernEuropeanQuoteConvention); + var westernEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([westernEuropeanQuoteConvention]) + ); + var westernEuropeanQuotationMarkResolver = new DepthBasedQuotationMarkResolver(westernEuropeanResolverSettings); + + TextSegment textSegment1 = new TextSegment.Builder().SetText("\u00abThis is a \u201cquote").Build(); + TextSegment textSegment2 = ( + new TextSegment.Builder() + .SetText("\u00bb\u201dThis is the rest\u201d of it\u00bb") + 
.AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + westernEuropeanQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment1, 0, 1), + new QuotationMarkStringMatch(textSegment1, 11, 12), + new QuotationMarkStringMatch(textSegment2, 0, 1), + new QuotationMarkStringMatch(textSegment2, 1, 2), + new QuotationMarkStringMatch(textSegment2, 18, 19), + new QuotationMarkStringMatch(textSegment2, 25, 26), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, textSegment1, 0, 1), + new QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.Opening, textSegment1, 11, 12), + new QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Opening, textSegment2, 0, 1), + new QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.Opening, textSegment2, 1, 2), + new QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.Closing, textSegment2, 18, 19), + new QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, textSegment2, 25, 26), + ] + ) + ); + Assert.That(westernEuropeanQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void MalformedQuotationMarks() + { + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + TextSegment textSegment1 = new TextSegment.Builder().SetText("\u201c This is a,\u2018 quote").Build(); + TextSegment textSegment2 = ( + new TextSegment.Builder() + .SetText("This is the rest \u2019 of it \u201d") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + 
standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment1, 0, 1), + new QuotationMarkStringMatch(textSegment1, 12, 13), + new QuotationMarkStringMatch(textSegment2, 17, 18), + new QuotationMarkStringMatch(textSegment2, 25, 26), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment1, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment1, 12, 13), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, textSegment2, 17, 18), + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment2, 25, 26), + ] + ) + ); + Assert.That(standardEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void UnpairedQuotationMarkIssue() + { + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + TextSegment textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u2018quote\u2019").Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 11, 12), + new QuotationMarkStringMatch(textSegment, 17, 18), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment, 11, 12), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, textSegment, 17, 18), + ] + ) + ); + Assert.That( + 
standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.UnpairedQuotationMark]) + ); + + textSegment = new TextSegment.Builder().SetText("another quote\u201d").Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks([new QuotationMarkStringMatch(textSegment, 13, 14),]) + .SequenceEqual( + [new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment, 13, 14),] + ) + ); + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.UnpairedQuotationMark]) + ); + } + + [Test] + public void TooDeepNestingIssue() + { + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + TextSegment textSegment = new TextSegment.Builder() + .SetText("\u201cThis \u2018is \u201ca \u2018quote \u201cnested too deeply") + .Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 6, 7), + new QuotationMarkStringMatch(textSegment, 10, 11), + new QuotationMarkStringMatch(textSegment, 13, 14), + new QuotationMarkStringMatch(textSegment, 20, 21), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, textSegment, 6, 7), + new QuotationMarkMetadata("\u201c", 3, QuotationMarkDirection.Opening, textSegment, 10, 11), + new QuotationMarkMetadata("\u2018", 4, QuotationMarkDirection.Opening, 
textSegment, 13, 14), + ] + ) + ); + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual( + [QuotationMarkResolutionIssue.TooDeepNesting, QuotationMarkResolutionIssue.UnpairedQuotationMark,] + ) + ); + } + + [Test] + public void IncompatibleQuotationMarkIssue() + { + QuoteConvention standardEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_english") + ); + Assert.IsNotNull(standardEnglishQuoteConvention); + var standardEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + var standardEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardEnglishResolverSettings); + + TextSegment textSegment = new TextSegment.Builder().SetText("\u201cThis is a \u201cquote\u201d\u201d").Build(); + Assert.That( + standardEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 11, 12), + new QuotationMarkStringMatch(textSegment, 17, 18), + new QuotationMarkStringMatch(textSegment, 18, 19), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.Opening, textSegment, 11, 12), + new QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.Closing, textSegment, 17, 18), + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment, 18, 19), + ] + ) + ); + Assert.That( + standardEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.IncompatibleQuotationMark]) + ); + } + + [Test] + public void AmbiguousQuotationMarkIssue() + { + QuoteConvention typewriterEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("typewriter_english") + ); + Assert.IsNotNull(typewriterEnglishQuoteConvention); + var 
typewriterEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + var typewriterEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver( + typewriterEnglishResolverSettings + ); + + TextSegment textSegment = new TextSegment.Builder().SetText("This\"is an ambiguous quotation mark").Build(); + Assert.That( + typewriterEnglishQuotationMarkResolver + .ResolveQuotationMarks([new QuotationMarkStringMatch(textSegment, 4, 5),]) + .Count(), + Is.EqualTo(0) + ); + Assert.That( + typewriterEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.AmbiguousQuotationMark]) + ); + + typewriterEnglishQuotationMarkResolver.Reset(); + textSegment = new TextSegment.Builder().SetText("\u201cThis is an ambiguous quotation mark").Build(); + Assert.That( + typewriterEnglishQuotationMarkResolver + .ResolveQuotationMarks([new QuotationMarkStringMatch(textSegment, 0, 1)]) + .Count(), + Is.EqualTo(0) + ); + Assert.That( + typewriterEnglishQuotationMarkResolver + .GetIssues() + .SequenceEqual([QuotationMarkResolutionIssue.AmbiguousQuotationMark]) + ); + } + + [Test] + public void TypewriterEnglishQuotationMarkRecognition() + { + QuoteConvention typewriterEnglishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("typewriter_english") + ); + Assert.IsNotNull(typewriterEnglishQuoteConvention); + var typewriterEnglishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + var typewriterEnglishQuotationMarkResolver = new DepthBasedQuotationMarkResolver( + typewriterEnglishResolverSettings + ); + + TextSegment textSegment = ( + new TextSegment.Builder() + .SetText("\"This is a 'quote'\"") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + typewriterEnglishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new 
QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 11, 12), + new QuotationMarkStringMatch(textSegment, 17, 18), + new QuotationMarkStringMatch(textSegment, 18, 19), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\"", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("'", 2, QuotationMarkDirection.Opening, textSegment, 11, 12), + new QuotationMarkMetadata("'", 2, QuotationMarkDirection.Closing, textSegment, 17, 18), + new QuotationMarkMetadata("\"", 1, QuotationMarkDirection.Closing, textSegment, 18, 19), + ] + ) + ); + Assert.That(typewriterEnglishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void TypewriterFrenchMarkRecognition() + { + QuoteConvention typewriterFrenchQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("typewriter_french") + ); + Assert.IsNotNull(typewriterFrenchQuoteConvention); + var typewriterFrenchResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([typewriterFrenchQuoteConvention]) + ); + var typewriterFrenchQuotationMarkResolver = new DepthBasedQuotationMarkResolver( + typewriterFrenchResolverSettings + ); + + /* The sample text is 21 characters long: "<<" spans 0-2, "<" spans 12-13, ">" spans 18-19 and ">>" spans 19-21, matching the spans used below. */ TextSegment textSegment = new TextSegment.Builder().SetText("<<This is a <quote>>>").Build(); + Assert.That( + typewriterFrenchQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 2), + new QuotationMarkStringMatch(textSegment, 12, 13), + new QuotationMarkStringMatch(textSegment, 18, 19), + new QuotationMarkStringMatch(textSegment, 19, 21), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("<<", 1, QuotationMarkDirection.Opening, textSegment, 0, 2), + new QuotationMarkMetadata("<", 2, QuotationMarkDirection.Opening, textSegment, 12, 13), + new QuotationMarkMetadata(">", 2, QuotationMarkDirection.Closing, textSegment, 18, 19), + new QuotationMarkMetadata(">>", 1, QuotationMarkDirection.Closing, textSegment, 19, 21), + ] + ) 
+ ); + Assert.That(typewriterFrenchQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void CentralEuropeanQuotationMarkRecognition() + { + QuoteConvention centralEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("central_european") + ); + Assert.IsNotNull(centralEuropeanQuoteConvention); + var centralEuropeanResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet([centralEuropeanQuoteConvention]) + ); + var centralEuropeanQuotationMarkResolver = new DepthBasedQuotationMarkResolver(centralEuropeanResolverSettings); + + TextSegment textSegment = ( + new TextSegment.Builder() + .SetText("\u201eThis is a \u201aquote\u2018\u201c") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + centralEuropeanQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 11, 12), + new QuotationMarkStringMatch(textSegment, 17, 18), + new QuotationMarkStringMatch(textSegment, 18, 19), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201e", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u201a", 2, QuotationMarkDirection.Opening, textSegment, 11, 12), + new QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Closing, textSegment, 17, 18), + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Closing, textSegment, 18, 19), + ] + ) + ); + Assert.That(centralEuropeanQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void StandardSwedishQuotationMarkRecognition() + { + QuoteConvention standardSwedishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish") + ); + Assert.IsNotNull(standardSwedishQuoteConvention); + var standardSwedishResolverSettings = new QuoteConventionDetectionResolutionSettings( + new 
QuoteConventionSet([standardSwedishQuoteConvention]) + ); + var standardSwedishQuotationMarkResolver = new DepthBasedQuotationMarkResolver(standardSwedishResolverSettings); + + TextSegment textSegment = ( + new TextSegment.Builder() + .SetText("\u201dThis is a \u2019quote\u2019\u201d") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + standardSwedishQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 11, 12), + new QuotationMarkStringMatch(textSegment, 17, 18), + new QuotationMarkStringMatch(textSegment, 18, 19), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Opening, textSegment, 11, 12), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, textSegment, 17, 18), + new QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, textSegment, 18, 19), + ] + ) + ); + Assert.That(standardSwedishQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + [Test] + public void MultipleConventionsQuotationMarkRecognition() + { + QuoteConvention typewriterFrenchQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( + "typewriter_french" + ); + + Assert.IsNotNull(typewriterFrenchQuoteConvention); + + QuoteConvention centralEuropeanQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("central_european") + ); + Assert.IsNotNull(centralEuropeanQuoteConvention); + + QuoteConvention standardSwedishQuoteConvention = ( + QuoteConventions.Standard.GetQuoteConventionByName("standard_swedish") + ); + Assert.IsNotNull(standardSwedishQuoteConvention); + var multipleConventionsResolverSettings = new QuoteConventionDetectionResolutionSettings( + new QuoteConventionSet( + [typewriterFrenchQuoteConvention, centralEuropeanQuoteConvention, 
standardSwedishQuoteConvention] + ) + ); + var multipleConventionsQuotationMarkResolver = new DepthBasedQuotationMarkResolver( + multipleConventionsResolverSettings + ); + + TextSegment textSegment = ( + new TextSegment.Builder() + .SetText("\u201eThis is a \u2019quote>\u201c") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build() + ); + Assert.That( + multipleConventionsQuotationMarkResolver + .ResolveQuotationMarks( + [ + new QuotationMarkStringMatch(textSegment, 0, 1), + new QuotationMarkStringMatch(textSegment, 11, 12), + new QuotationMarkStringMatch(textSegment, 17, 18), + new QuotationMarkStringMatch(textSegment, 18, 19), + ] + ) + .SequenceEqual( + [ + new QuotationMarkMetadata("\u201e", 1, QuotationMarkDirection.Opening, textSegment, 0, 1), + new QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Opening, textSegment, 11, 12), + new QuotationMarkMetadata(">", 2, QuotationMarkDirection.Closing, textSegment, 17, 18), + new QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Closing, textSegment, 18, 19), + ] + ) + ); + Assert.That(multipleConventionsQuotationMarkResolver.GetIssues(), Has.Count.EqualTo(0)); + } + + private class TestQuoteContinuerState : QuoteContinuerState + { + public QuoteContinuerStyle InternalContinuerStyle + { + get => ContinuerStyle; + set => ContinuerStyle = value; + } + } +} diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs new file mode 100644 index 00000000..29d04e55 --- /dev/null +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzerTests.cs @@ -0,0 +1,1179 @@ +using NUnit.Framework; + +namespace SIL.Machine.PunctuationAnalysis; + +[TestFixture] +public class PreliminaryQuotationMarkAnalyzerTests +{ + # region ApostropheProportionStatistics + [Test] + public void ApostropheProportionStatisticsReset() + { + var apostropheProportionStatistics = new 
ApostropheProportionStatistics(); + apostropheProportionStatistics.CountCharacters(new TextSegment.Builder().SetText("'").Build()); + apostropheProportionStatistics.AddApostrophe(); + Assert.IsTrue(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.5)); + + apostropheProportionStatistics.Reset(); + Assert.IsFalse(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.5)); + } + + [Test] + public void IsApostropheProportionGreaterThan() + { + var apostropheProportionStatistics = new ApostropheProportionStatistics(); + Assert.IsFalse(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.0)); + + // invalid case where no characters have been counted + apostropheProportionStatistics.AddApostrophe(); + Assert.IsFalse(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.0)); + + apostropheProportionStatistics.CountCharacters(new TextSegment.Builder().SetText("a").Build()); + Assert.IsTrue(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.99)); + + apostropheProportionStatistics.AddApostrophe(); + apostropheProportionStatistics.CountCharacters(new TextSegment.Builder().SetText("bcd").Build()); + Assert.IsTrue(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.4)); + Assert.IsFalse(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.5)); + + apostropheProportionStatistics.CountCharacters(new TextSegment.Builder().SetText("ef").Build()); + Assert.IsTrue(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.3)); + Assert.IsFalse(apostropheProportionStatistics.IsApostropheProportionGreaterThan(0.4)); + } + + #endregion + #region QuotationMarkWordPosition + + [Test] + public void IsMarkRarelyInitial() + { + var quotationMarkWordPositions = new QuotationMarkWordPositions(); + Assert.IsFalse(quotationMarkWordPositions.IsMarkRarelyInitial("\u201d")); + + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + 
Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyInitial("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + Assert.IsFalse(quotationMarkWordPositions.IsMarkRarelyInitial("\u201d")); + + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyInitial("\u201d")); + + quotationMarkWordPositions.CountWordFinalApostrophe("\u201c"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyInitial("\u201d")); + + quotationMarkWordPositions.CountWordFinalApostrophe("\u201c"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201c"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyInitial("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + Assert.IsFalse(quotationMarkWordPositions.IsMarkRarelyInitial("\u201d")); + } + + [Test] + public void IsMarkRarelyFinal() + { + var quotationMarkWordPositions = new QuotationMarkWordPositions(); + Assert.IsFalse(quotationMarkWordPositions.IsMarkRarelyFinal("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyFinal("\u201d")); + + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + Assert.IsFalse(quotationMarkWordPositions.IsMarkRarelyFinal("\u201d")); + + 
quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyFinal("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201c"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyFinal("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201c"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201c"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkRarelyFinal("\u201d")); + + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + Assert.IsFalse(quotationMarkWordPositions.IsMarkRarelyFinal("\u201d")); + } + + [Test] + public void AreInitialAndFinalRatesSimilar() + { + var quotationMarkWordPositions = new QuotationMarkWordPositions(); + Assert.IsFalse(quotationMarkWordPositions.AreInitialAndFinalRatesSimilar("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + Assert.IsTrue(quotationMarkWordPositions.AreInitialAndFinalRatesSimilar("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + Assert.IsFalse(quotationMarkWordPositions.AreInitialAndFinalRatesSimilar("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + 
quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + Assert.IsTrue(quotationMarkWordPositions.AreInitialAndFinalRatesSimilar("\u201d")); + + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + Assert.IsFalse(quotationMarkWordPositions.AreInitialAndFinalRatesSimilar("\u201d")); + + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + Assert.IsTrue(quotationMarkWordPositions.AreInitialAndFinalRatesSimilar("\u201d")); + } + + [Test] + public void IsMarkCommonlyMidWord() + { + var quotationMarkWordPositions = new QuotationMarkWordPositions(); + Assert.IsFalse(quotationMarkWordPositions.IsMarkCommonlyMidWord("'")); + + quotationMarkWordPositions.CountMidWordApostrophe("'"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkCommonlyMidWord("'")); + + quotationMarkWordPositions.CountWordInitialApostrophe("'"); + quotationMarkWordPositions.CountWordFinalApostrophe("'"); + quotationMarkWordPositions.CountWordInitialApostrophe("'"); + quotationMarkWordPositions.CountWordFinalApostrophe("'"); + Assert.IsFalse(quotationMarkWordPositions.IsMarkCommonlyMidWord("'")); + + quotationMarkWordPositions.CountMidWordApostrophe("'"); + Assert.IsTrue(quotationMarkWordPositions.IsMarkCommonlyMidWord("'")); + } + + [Test] + public void QuotationMarkWordPositionsReset() + { + var quotationMarkWordPositions = new QuotationMarkWordPositions(); + quotationMarkWordPositions.CountWordInitialApostrophe("\u201d"); + quotationMarkWordPositions.CountWordFinalApostrophe("\u201d"); + 
quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + quotationMarkWordPositions.CountMidWordApostrophe("\u201d"); + + Assert.IsTrue(quotationMarkWordPositions.IsMarkCommonlyMidWord("\u201d")); + + quotationMarkWordPositions.Reset(); + + Assert.IsFalse(quotationMarkWordPositions.IsMarkCommonlyMidWord("\u201d")); + } + #endregion + #region QuotationMarkSequence + + [Test] + public void IsMarkMuchMoreCommonEarlier() + { + var quotationMarkSequences = new QuotationMarkSequences(); + Assert.IsFalse(quotationMarkSequences.IsMarkMuchMoreCommonEarlier("\"")); + + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + Assert.IsTrue(quotationMarkSequences.IsMarkMuchMoreCommonEarlier("\"")); + + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsFalse(quotationMarkSequences.IsMarkMuchMoreCommonEarlier("\"")); + + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + Assert.IsTrue(quotationMarkSequences.IsMarkMuchMoreCommonEarlier("\"")); + + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsFalse(quotationMarkSequences.IsMarkMuchMoreCommonEarlier("\"")); + } + + [Test] + public void IsMarkMuchMoreCommonLater() + { + var quotationMarkSequences = new QuotationMarkSequences(); + Assert.IsFalse(quotationMarkSequences.IsMarkMuchMoreCommonLater("\"")); + + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + 
quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsTrue(quotationMarkSequences.IsMarkMuchMoreCommonLater("\"")); + + quotationMarkSequences.CountEarlierQuotationMark("\""); + Assert.IsFalse(quotationMarkSequences.IsMarkMuchMoreCommonLater("\"")); + + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsTrue(quotationMarkSequences.IsMarkMuchMoreCommonLater("\"")); + + quotationMarkSequences.CountEarlierQuotationMark("\""); + Assert.IsFalse(quotationMarkSequences.IsMarkMuchMoreCommonLater("\"")); + } + + [Test] + public void IsMarkCommonEarlyAndLate() + { + var quotationMarkSequences = new QuotationMarkSequences(); + Assert.IsFalse(quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar("\"")); + + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsTrue(quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar("\"")); + + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountEarlierQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsTrue(quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar("\"")); + + quotationMarkSequences.CountLaterQuotationMark("\""); + 
Assert.IsTrue(quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar("\"")); + + quotationMarkSequences.CountLaterQuotationMark("\""); + quotationMarkSequences.CountLaterQuotationMark("\""); + Assert.IsFalse(quotationMarkSequences.AreEarlyAndLateMarkRatesSimilar("\"")); + } + + #endregion + #region QuotationMarkGrouper + + [Test] + public void GetQuotationMarkPairs() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var typewriterEnglishQuoteConvention = new QuoteConvention( + "typewriter_english", + [ + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + ] + ); + + var quotationMarkGrouper = new QuotationMarkGrouper( + [], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([])); + + // no paired quotation mark + quotationMarkGrouper = new QuotationMarkGrouper( + [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1)], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([])); + + // basic quotation mark pair + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 1, 2), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([("\u201c", "\u201d")])); + + // out-of-order quotation mark pair + 
quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d\u201c").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201d\u201c").Build(), 1, 2), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([])); + + // multiple unpaired quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2019").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2019").Build(), 1, 2), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([])); + + // paired and unpaired quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2018\u201d").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2018\u201d").Build(), 1, 2), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u2018\u201d").Build(), 2, 3), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([("\u201c", "\u201d")])); + + // ambiguous unpaired quotation mark + quotationMarkGrouper = new QuotationMarkGrouper( + [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"").Build(), 0, 1)], + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([])); + + // paired ambiguous quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"\"").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"\"").Build(), 
1, 2), + ], + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([("\"", "\"")])); + + // multiple paired quotation marks (should be skipped because we don't know how to pair them) + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u201c\u201d").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u201c\u201d").Build(), + 1, + 2 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u201c\u201d").Build(), + 2, + 3 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u201c\u201d").Build(), + 3, + 4 + ), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([])); + + // multiple different paired quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u2018\u2019").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u2018\u2019").Build(), + 1, + 2 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u2018\u2019").Build(), + 2, + 3 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d\u2018\u2019").Build(), + 3, + 4 + ), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That( + quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([("\u201c", "\u201d"), ("\u2018", "\u2019")]) + ); + + // second-level paired quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018\u2019").Build(), 0, 1), + new QuotationMarkStringMatch(new 
TextSegment.Builder().SetText("\u2018\u2019").Build(), 1, 2), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([("\u2018", "\u2019")])); + + // quotation marks that don't match the convention set + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 1, 2), + ], + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + Assert.That(quotationMarkGrouper.GetQuotationMarkPairs().SequenceEqual([])); + } + + [Test] + public void HasDistinctPairedQuotationMarks() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var typewriterEnglishQuoteConvention = new QuoteConvention( + "typewriter_english", + [ + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + ] + ); + + var quotationMarkGrouper = new QuotationMarkGrouper( + [], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201c")); + Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201d")); + Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("")); + + // basic paired quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 1, 2), + ], 
+ new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.IsTrue(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201c")); + Assert.IsTrue(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201d")); + + // second-level paired quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018\u2019").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u2018\u2019").Build(), 1, 2), + ], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.IsTrue(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u2018")); + Assert.IsTrue(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u2019")); + + // only one half of the pair observed + quotationMarkGrouper = new QuotationMarkGrouper( + [new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c").Build(), 0, 1),], + new QuoteConventionSet([standardEnglishQuoteConvention]) + ); + Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201c")); + Assert.IsTrue(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201d")); + + // quotation marks that don't match the convention set + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\u201c\u201d").Build(), 1, 2), + ], + new QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201c")); + Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("\u201d")); + + // ambiguous quotation marks + quotationMarkGrouper = new QuotationMarkGrouper( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"\"").Build(), 0, 1), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"\"").Build(), 1, 2), + ], + new 
QuoteConventionSet([typewriterEnglishQuoteConvention]) + ); + Assert.IsFalse(quotationMarkGrouper.HasDistinctPairedQuotationMark("\"")); + } + + #endregion + #region PreliminaryApostropheAnalyzer + + [Test] + public void ThatTheMarkMustBeAnApostrophe() + { + var preliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + preliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("alternative mid\u2019word apostrophe").Build(), + 15, + 16 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("mid\u2018word quotation mark").Build(), + 3, + 4 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("mid\u201cword quotation mark").Build(), + 3, + 4 + ), + ] + ); + Assert.IsTrue(preliminaryApostropheAnalyzer.IsApostropheOnly("'")); + Assert.IsTrue(preliminaryApostropheAnalyzer.IsApostropheOnly("\u2019")); + Assert.IsFalse(preliminaryApostropheAnalyzer.IsApostropheOnly("\u2018")); + Assert.IsFalse(preliminaryApostropheAnalyzer.IsApostropheOnly("\u201c")); + Assert.IsFalse(preliminaryApostropheAnalyzer.IsApostropheOnly("\u201d")); + } + + [Test] + public void ThatARarelyInitialOrFinalMarkIsAnApostrophe() + { + var negativePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + negativePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely 
an apostrophe, instead of a quotation mark" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + ] + ); + Assert.IsFalse(negativePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + + var positivePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + positivePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .Build(), + new TextSegment.Builder() + .SetText("Technically Unicode has a separate character for the glottal stop, but it is rarely used") + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new 
QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + ] + ); + Assert.IsTrue(positivePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + } + + [Test] + public void ThatAMarkWithSimilarFinalAndInitialRatesIsAnApostrophe() + { + var negativePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + negativePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "We need a ton of text here to keep the proportion low, since we have 8 apostrophes in this test" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 
+ ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + ] + ); + Assert.IsFalse(negativePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + + var negativePreliminaryApostropheAnalyzer2 = new PreliminaryApostropheAnalyzer(); + negativePreliminaryApostropheAnalyzer2.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "We need a ton of text here to keep the proportion low, since we have 8 apostrophes in this test" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' 
final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + ] + ); + Assert.IsFalse(negativePreliminaryApostropheAnalyzer2.IsApostropheOnly("'")); + + var positivePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + positivePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "We need a ton of text here to keep the proportion low, since we have 8 apostrophes in this test" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .Build(), + new TextSegment.Builder() + .SetText( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new 
QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + ] + ); + Assert.IsTrue(positivePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + } + + [Test] + public void ThatACommonlyMidWordMarkIsAnApostrophe() + { + var negativePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + negativePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + ] + ); + Assert.IsFalse(negativePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + + var positivePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + positivePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + 
new QuotationMarkStringMatch(new TextSegment.Builder().SetText("mid'word apostrophe").Build(), 3, 4), + ] + ); + Assert.IsTrue(positivePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + } + + [Test] + public void ThatAFrequentlyOccurringCharacterIsAnApostrophe() + { + var negativePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + negativePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [ + new TextSegment.Builder() + .SetText("Long text segment to help keep the proportion of apostrophes low") + .Build(), + new TextSegment.Builder() + .SetText( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .Build(), + ], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + ] + ); + Assert.IsFalse(negativePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + + var positivePreliminaryApostropheAnalyzer = new PreliminaryApostropheAnalyzer(); + positivePreliminaryApostropheAnalyzer.ProcessQuotationMarks( + [new TextSegment.Builder().SetText("Very short text").Build(),], + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("'word initial apostrophe").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("word' final apostrophe").Build(), 4, 5), + ] + ); + Assert.IsTrue(positivePreliminaryApostropheAnalyzer.IsApostropheOnly("'")); + } + + #endregion + #region PreliminaryQuotationMarkAnalyzer + + [Test] + public void ThatQuotationMarkSequenceIsUsedToDetermineOpeningAndClosingQuotes() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new 
SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var typewriterEnglishQuoteConvention = new QuoteConvention( + "typewriter_english", + [ + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + ] + ); + var standardFrenchQuoteConvention = new QuoteConvention( + "standard_french", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + ] + ); + + var westernEuropeanQuoteConvention = new QuoteConvention( + "western_european", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + + var preliminaryQuotationAnalyzer = new PreliminaryQuotationMarkAnalyzer( + new QuoteConventionSet( + [ + standardEnglishQuoteConvention, + typewriterEnglishQuoteConvention, + standardFrenchQuoteConvention, + westernEuropeanQuoteConvention, + standardSwedishQuoteConvention, + ] + ) + ); + + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("initial text \u201c quoted English text \u201d final text") + .Build() + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([standardEnglishQuoteConvention])) + ); + + preliminaryQuotationAnalyzer.Reset(); + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new 
Verse( + [ + new TextSegment.Builder() + .SetText("initial text \u201d quoted Swedish text \u201d final text") + .Build(), + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([standardSwedishQuoteConvention])) + ); + + preliminaryQuotationAnalyzer.Reset(); + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText( + "initial text \u00ab quoted French/Western European text \u00bb final text" + ) + .Build(), + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([standardFrenchQuoteConvention, westernEuropeanQuoteConvention])) + ); + + preliminaryQuotationAnalyzer.Reset(); + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("initial text \" quoted typewriter English text \" final text") + .Build(), + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([typewriterEnglishQuoteConvention])) + ); + + preliminaryQuotationAnalyzer.Reset(); + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("initial text \u201c quoted English text \u201d final text") + .Build(), + new TextSegment.Builder() + .SetText("second level \u2018 English quotes \u2019") + .Build(), + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([standardEnglishQuoteConvention])) + ); + + preliminaryQuotationAnalyzer.Reset(); + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("initial text \" quoted typewriter English text \" final text") + .Build(), + new TextSegment.Builder().SetText("second level 'typewriter quotes'").Build(), + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([typewriterEnglishQuoteConvention])) + ); + + 
preliminaryQuotationAnalyzer.Reset(); + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("initial text \u201c quoted English text \u201d final text") + .Build(), + new TextSegment.Builder() + .SetText("the quotes \u201d in this segment \u201c are backwards") + .Build(), + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([])) + ); + + preliminaryQuotationAnalyzer.Reset(); + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText( + "first-level quotes \u2018 must be observed \u2019 to retain a quote convention" + ) + .Build(), + ] + ) + ] + ) + ] + ), + Is.EqualTo(new QuoteConventionSet([])) + ); + } + + [Test] + public void ThatApostrophesNotConsideredAsQuotationMarks() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var typewriterEnglishQuoteConvention = new QuoteConvention( + "typewriter_english", + [ + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + new SingleLevelQuoteConvention("'", "'"), + ] + ); + + var preliminaryQuotationAnalyzer = new PreliminaryQuotationMarkAnalyzer( + new QuoteConventionSet([standardEnglishQuoteConvention, typewriterEnglishQuoteConvention,]) + ); + + Assert.That( + preliminaryQuotationAnalyzer.NarrowDownPossibleQuoteConventions( + [ + new Chapter( + [ + new Verse( + [ + new TextSegment.Builder() + .SetText("ini'tial 'text \u201c quo'ted English text' \u201d fi'nal text") + .Build() + ] + ) + ] + ) + ] + ), + Is.EqualTo(new 
QuoteConventionSet([standardEnglishQuoteConvention])) + ); + } + #endregion +} diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs new file mode 100644 index 00000000..8b34a377 --- /dev/null +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs @@ -0,0 +1,373 @@ +using NUnit.Framework; +using SIL.Machine.Corpora; + +namespace SIL.Machine.PunctuationAnalysis; + +[TestFixture] +public class QuotationConventionDetectorTests +{ + // Text comes from the World English Bible, which is in the public domain. + [Test] + public void StandardEnglish() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english")); + } + + [Test] + public void TypewriterEnglish() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + 'You shall not eat of any tree of the garden'?\"" + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_english")); + } + + [Test] + public void BritishEnglish() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, ‘Has God really said, + “You shall not eat of any tree of the garden”?’ + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("british_english")); + } + + [Test] + public void BritishTypewriterEnglish() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, 'Has God really said, + ""You shall not eat of any tree of the garden""?' + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("british_typewriter_english")); + } + + [Test] + public void HybridTypewriterEnglish() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + 'You shall not eat of any tree of the garden'?” + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("hybrid_typewriter_english")); + } + + [Test] + public void StandardFrench() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + ‹You shall not eat of any tree of the garden›?» + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_french")); + } + + [Test] + public void TypewriterFrench() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, <<Has God really said, + <You shall not eat of any tree of the garden>?>> + "; + /* NOTE(review): the quoted passage above was rebuilt from the surviving angle-bracket residue and the layout shared by the sibling tests (outer quotation around the question, inner quotation around the command) -- confirm the marks against the typewriter_french convention definition. */ + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_french")); + } + + // frenchVariant requires a 3rd-level of quotes to differentiate from standardFrench + [Test] + /* Guillemets around the outer quotation, curly double quotes around the inner one. */ + public void WesternEuropean() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + “You shall not eat of any tree of the garden”?» + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("western_european")); + } + + [Test] + /* Guillemets around the outer quotation, curly single quotes around the inner one. */ + public void BritishInspiredWesternEuropean() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + ‘You shall not eat of any tree of the garden’?» + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("british_inspired_western_european")); + } + + [Test] + /* Doubled angle brackets around the outer quotation, straight double quotes (doubled inside the verbatim string) around the inner one. NOTE(review): the quoted passage was rebuilt from the surviving angle-bracket residue -- confirm the marks against the typewriter_western_european convention definition. */ + public void TypewriterWesternEuropean() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, <<Has God really said, + ""You shall not eat of any tree of the garden""?>> + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_western_european")); + } + + [Test] + public void TypewriterWesternEuropeanVariant() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, ""Has God really said, + <You shall not eat of any tree of the garden>?"" + "; + /* NOTE(review): the inner quotation above was rebuilt -- only the trailing question mark and closing quote had survived; single angle brackets are presumed from the convention name, confirm against the typewriter_western_european_variant definition. */ + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_western_european_variant")); + } + + [Test] + /* Guillemets around the outer quotation, straight double quotes (doubled inside the verbatim string) around the inner one. */ + public void HybridTypewriterWesternEuropean() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + ""You shall not eat of any tree of the garden""?» + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("hybrid_typewriter_western_european")); + } + + [Test] + /* Guillemets around the outer quotation, straight single quotes around the inner one. */ + public void HybridBritishTypewriterWesternEuropean() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + 'You shall not eat of any tree of the garden'?» + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("hybrid_british_typewriter_western_european")); + } + + [Test] + /* Low double mark opening and high double mark closing the outer quotation; low single mark opening and high single mark closing the inner one. */ + public void CentralEuropean() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, „Has God really said, + ‚You shall not eat of any tree of the garden‘?“ + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("central_european")); + } + + [Test] + public void CentralEuropeanGuillemets() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, »Has God really said, + ›You shall not eat of any tree of the garden‹?« + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("central_european_guillemets")); + } + + [Test] + public void StandardSwedish() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ”Has God really said, + ’You shall not eat of any tree of the garden’?” + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_swedish")); + } + + [Test] + public void StandardFinnish() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, »Has God really said, + ’You shall not eat of any tree of the garden’?» + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_finnish")); + } + + [Test] + public void EasternEuropean() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, „Has God really said, + ‚You shall not eat of any tree of the garden’?” + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("eastern_european")); + } + + [Test] + public void StandardRussian() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, «Has God really said, + „You shall not eat of any tree of the garden“?» + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_russian")); + } + + [Test] + /* Right-hand curly double mark opening and left-hand one closing the outer quotation; the single marks of the inner quotation are reversed the same way. */ + public void StandardArabic() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ”Has God really said, + ’You shall not eat of any tree of the garden‘?“ + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_arabic")); + } + + [Test] + /* Guillemets around the outer quotation; reversed curly single marks around the inner one. */ + public void NonStandardArabic() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + ’You shall not eat of any tree of the garden‘?» + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("non-standard_arabic")); + } + + [Test] + /* Three verses in which the double quotation opened in verse 2 is never closed; the detector is still expected to report standard_english. Backslashes are literal inside a verbatim string, so every verse marker is written with a single backslash, matching the first one. */ + public void MismatchedQuotationMarks() + { + string usfm = + @" +\c 1 +\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + \v 2 The woman said to the serpent, + “We may eat fruit from the trees of the garden, + \v 3 but not the fruit of the tree which is in the middle of the garden. + God has said, ‘You shall not eat of it. 
You shall not touch it, lest you die.’ + "; + QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); + Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english")); + } + + public QuoteConventionAnalysis DetectQuotationConvention(string usfm) + { + var quoteConventionDetector = new QuoteConventionDetector(); + UsfmParser.Parse(usfm, quoteConventionDetector); + return quoteConventionDetector.DetectQuotationConvention(); + } +} diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs new file mode 100644 index 00000000..0c70b746 --- /dev/null +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs @@ -0,0 +1,428 @@ +using NUnit.Framework; + +namespace SIL.Machine.PunctuationAnalysis; + +[TestFixture] +public class QuotationMarkFinderTests +{ + [Test] + public void ThatAllPossibleQuotationMarksAreIdentified() + { + var quotationMarkFinder = new QuotationMarkFinder(QuoteConventions.Standard); + Assert.That( + quotationMarkFinder + .FindAllPotentialQuotationMarksInTextSegment( + new TextSegment.Builder().SetText("\u201cSample Text\u201d").Build() + ) + .SequenceEqual( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201cSample Text\u201d").Build(), + 0, + 1 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201cSample Text\u201d").Build(), + 12, + 13 + ), + ] + ) + ); + + Assert.That( + quotationMarkFinder + .FindAllPotentialQuotationMarksInTextSegment( + new TextSegment.Builder().SetText("\"Sample Text'").Build() + ) + .SequenceEqual( + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("\"Sample Text'").Build(), 0, 1), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\"Sample Text'").Build(), + 12, + 13 + ), + ] + ) + ); + + Assert.That( + quotationMarkFinder + 
.FindAllPotentialQuotationMarksInTextSegment( + new TextSegment.Builder().SetText("All \u201cthe \u2019English quotation\u2018 marks\u201d").Build() + ) + .SequenceEqual( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u201cthe \u2019English quotation\u2018 marks\u201d") + .Build(), + 4, + 5 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u201cthe \u2019English quotation\u2018 marks\u201d") + .Build(), + 9, + 10 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u201cthe \u2019English quotation\u2018 marks\u201d") + .Build(), + 27, + 28 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u201cthe \u2019English quotation\u2018 marks\u201d") + .Build(), + 34, + 35 + ) + ] + ) + ); + + Assert.That( + quotationMarkFinder + .FindAllPotentialQuotationMarksInTextSegment( + new TextSegment.Builder().SetText("All \u00abthe \u2039French quotation\u203a marks\u00bb").Build() + ) + .SequenceEqual( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u00abthe \u2039French quotation\u203a marks\u00bb") + .Build(), + 4, + 5 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u00abthe \u2039French quotation\u203a marks\u00bb") + .Build(), + 9, + 10 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u00abthe \u2039French quotation\u203a marks\u00bb") + .Build(), + 26, + 27 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("All \u00abthe \u2039French quotation\u203a marks\u00bb") + .Build(), + 33, + 34 + ), + ] + ) + ); + + Assert.That( + quotationMarkFinder + .FindAllPotentialQuotationMarksInTextSegment( + new TextSegment.Builder().SetText("All \"the 'typewriter quotation marks").Build() + ) + .SequenceEqual( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("All \"the 'typewriter quotation marks").Build(), + 4, + 5 + ), 
+ new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("All \"the 'typewriter quotation marks").Build(), + 9, + 10 + ), + ] + ) + ); + + Assert.That( + quotationMarkFinder + .FindAllPotentialQuotationMarksInTextSegment( + new TextSegment.Builder() + .SetText("This has \u201equotes from \u00bbdifferent conventions < quotationMarkStringMatches = + [ + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("Opening “quote").Build(), 8, 9), + new QuotationMarkStringMatch(new TextSegment.Builder().SetText("Another opening ‘quote").Build(), 16, 17), + new QuotationMarkStringMatch( + new TextSegment.Builder() + .SetText("“‘quote continuer") + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + 0, + 1 + ) + ]; + + quotationMarkResolver.ResolveQuotationMarks(quotationMarkStringMatches).ToList(); + Assert.That(quotationMarkResolver.QuotationMarkResolverState.Quotations.Count(), Is.GreaterThan(0)); + Assert.IsTrue(quotationMarkResolver.QuotationMarkResolverState.CurrentDepth > 0); + + quotationMarkResolver.Reset(); + + Assert.That(quotationMarkResolver.QuotationMarkResolverState.Quotations.Count(), Is.EqualTo(0)); + Assert.That(quotationMarkResolver.QuoteContinuerState.QuoteContinuerMarks.Count(), Is.EqualTo(0)); + Assert.That(quotationMarkResolver.QuotationMarkResolverState.CurrentDepth, Is.EqualTo(0)); + Assert.That(quotationMarkResolver.QuoteContinuerState.CurrentDepth, Is.EqualTo(0)); + } +} diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs new file mode 100644 index 00000000..cac692e7 --- /dev/null +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs @@ -0,0 +1,746 @@ +using System.Text.RegularExpressions; +using NUnit.Framework; + +namespace SIL.Machine.PunctuationAnalysis; + +[TestFixture] +public class QuotationMarkStringMatchTests +{ + [Test] + public void GetQuotationMark() + { + var 
quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("quick brown fox").Build(), + 6, + 7 + ); + Assert.That(quotationMarkStringMatch.QuotationMark, Is.EqualTo("b")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("quick brown fox").Build(), + 6, + 10 + ); + Assert.That(quotationMarkStringMatch.QuotationMark, Is.EqualTo("brow")); + + quotationMarkStringMatch = new QuotationMarkStringMatch(new TextSegment.Builder().SetText("q").Build(), 0, 1); + Assert.That(quotationMarkStringMatch.QuotationMark, Is.EqualTo("q")); + } + + [Test] + public void IsValidOpeningQuotationMark() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standardEnglish", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c").Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.IsValidOpeningQuotationMark(standardEnglishQuoteConventionSet)); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201d").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.IsValidOpeningQuotationMark(standardEnglishQuoteConventionSet)); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201d\u201c").Build(), + 1, + 2 + ); + Assert.IsTrue(quotationMarkStringMatch.IsValidOpeningQuotationMark(standardEnglishQuoteConventionSet)); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201d\u201c").Build(), + 0, + 2 + ); + 
Assert.IsFalse(quotationMarkStringMatch.IsValidOpeningQuotationMark(standardEnglishQuoteConventionSet)); + } + + [Test] + public void IsValidClosingQuotationMark() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standardEnglish", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201d").Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.IsValidClosingQuotationMark(standardEnglishQuoteConventionSet)); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.IsValidClosingQuotationMark(standardEnglishQuoteConventionSet)); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201d\u201c").Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.IsValidClosingQuotationMark(standardEnglishQuoteConventionSet)); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201d\u201c").Build(), + 0, + 2 + ); + Assert.IsFalse(quotationMarkStringMatch.IsValidClosingQuotationMark(standardEnglishQuoteConventionSet)); + } + + [Test] + public void DoesQuotationMarkMatch() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.QuotationMarkMatches(new Regex(@"^s$", RegexOptions.Compiled))); + Assert.IsFalse(quotationMarkStringMatch.QuotationMarkMatches(new Regex(@"a", RegexOptions.Compiled))); + 
Assert.IsFalse(quotationMarkStringMatch.QuotationMarkMatches(new Regex(@"sa", RegexOptions.Compiled))); + } + + [Test] + public void DoesNextCharacterMatch() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.NextCharacterMatches(new Regex(@"^s$", RegexOptions.Compiled))); + Assert.IsTrue(quotationMarkStringMatch.NextCharacterMatches(new Regex(@"a", RegexOptions.Compiled))); + Assert.IsFalse(quotationMarkStringMatch.NextCharacterMatches(new Regex(@"sa", RegexOptions.Compiled))); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.NextCharacterMatches(new Regex(@".*", RegexOptions.Compiled))); + } + + [Test] + public void DoesPreviousCharacterMatch() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 1, + 2 + ); + Assert.IsTrue(quotationMarkStringMatch.PreviousCharacterMatches(new Regex(@"^s$", RegexOptions.Compiled))); + Assert.IsFalse(quotationMarkStringMatch.PreviousCharacterMatches(new Regex(@"a", RegexOptions.Compiled))); + Assert.IsFalse(quotationMarkStringMatch.PreviousCharacterMatches(new Regex(@"sa", RegexOptions.Compiled))); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.PreviousCharacterMatches(new Regex(@".*", RegexOptions.Compiled))); + } + + [Test] + public void GetPreviousCharacter() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 1, + 2 + ); + Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.EqualTo("s")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new 
TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.EqualTo("x")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + /* Nothing precedes the first character of a lone segment. NOTE(review): null is inferred from DoesPreviousCharacterMatch, where even a match-anything pattern fails at index 0 -- confirm against the PreviousCharacter getter. */ + Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.Null); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d").Build(), + 1, + 2 + ); + Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.EqualTo("“")); + } + + [Test] + /* Checks NextCharacter for a match in the middle of a segment, at its start, at its very end, and directly before another quotation mark. */ + public void GetNextCharacter() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 1, + 2 + ); + Assert.That(quotationMarkStringMatch.NextCharacter, Is.EqualTo("m")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.That(quotationMarkStringMatch.NextCharacter, Is.EqualTo("a")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + /* Nothing follows the last character of a lone segment. NOTE(review): null is inferred from DoesNextCharacterMatch, where even a match-anything pattern fails at the final index -- confirm against the NextCharacter getter. */ + Assert.That(quotationMarkStringMatch.NextCharacter, Is.Null); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d").Build(), + 0, + 1 + ); + Assert.That(quotationMarkStringMatch.NextCharacter, Is.EqualTo("”")); + } + + [Test] + public void DoesLeadingSubstringMatch() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 5, + 6 + ); + Assert.IsTrue(quotationMarkStringMatch.LeadingSubstringMatches(new Regex(@"^sampl$", RegexOptions.Compiled))); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.LeadingSubstringMatches(new Regex(@".+", RegexOptions.Compiled))); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d").Build(), + 1, + 2 + ); 
+ Assert.IsTrue(quotationMarkStringMatch.LeadingSubstringMatches(new Regex(@"\u201c", RegexOptions.Compiled))); + } + + [Test] + public void DoesTrailingSubstringMatch() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 5, + 6 + ); + Assert.IsTrue(quotationMarkStringMatch.TrailingSubstringMatches(new Regex(@"^ text$", RegexOptions.Compiled))); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.TrailingSubstringMatches(new Regex(@".+", RegexOptions.Compiled))); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201c\u201d").Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.TrailingSubstringMatches(new Regex(@"\u201d", RegexOptions.Compiled))); + } + + [Test] + public void GetContext() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("this is a bunch' of sample text").Build(), + 15, + 16 + ); + Assert.That(quotationMarkStringMatch.Context, Is.EqualTo("is a bunch' of sample")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("this is a bunch' of sample text").Build(), + 5, + 6 + ); + Assert.That(quotationMarkStringMatch.Context, Is.EqualTo("this is a bunch'")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("this is a bunch' of sample text").Build(), + 25, + 26 + ); + Assert.That(quotationMarkStringMatch.Context, Is.EqualTo("' of sample text")); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("short").Build(), + 3, + 4 + ); + Assert.That(quotationMarkStringMatch.Context, Is.EqualTo("short")); + } + + [Test] + public void Resolve() + { + TextSegment textSegment = new 
TextSegment.Builder().SetText("'").Build(); + var quotationMarkStringMatch = new QuotationMarkStringMatch(textSegment, 0, 1); + Assert.That( + quotationMarkStringMatch.Resolve(2, QuotationMarkDirection.Opening), + Is.EqualTo(new QuotationMarkMetadata("'", 2, QuotationMarkDirection.Opening, textSegment, 0, 1)) + ); + Assert.That( + quotationMarkStringMatch.Resolve(1, QuotationMarkDirection.Opening), + Is.EqualTo(new QuotationMarkMetadata("'", 1, QuotationMarkDirection.Opening, textSegment, 0, 1)) + ); + Assert.That( + quotationMarkStringMatch.Resolve(1, QuotationMarkDirection.Closing), + Is.EqualTo(new QuotationMarkMetadata("'", 1, QuotationMarkDirection.Closing, textSegment, 0, 1)) + ); + } + + [Test] + public void IsAtStartOfSegment() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.IsAtStartOfSegment); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 1, + 2 + ); + Assert.IsFalse(quotationMarkStringMatch.IsAtStartOfSegment); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201csample text").Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.IsAtStartOfSegment); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 15, + 16 + ); + Assert.IsFalse(quotationMarkStringMatch.IsAtStartOfSegment); + } + + [Test] + public void IsAtEndOfSegment() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + Assert.IsTrue(quotationMarkStringMatch.IsAtEndOfSegment); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.IsAtEndOfSegment); + 
+ quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201csample text\u201d").Build(), + 12, + 13 + ); + Assert.IsTrue(quotationMarkStringMatch.IsAtEndOfSegment); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 15, + 16 + ); + Assert.IsFalse(quotationMarkStringMatch.IsAtEndOfSegment); + } + + [Test] + public void HasLeadingWhitespace() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 7, + 8 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample\ttext").Build(), + 7, + 8 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Paragraph).Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Embed).Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Verse).Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Chapter).Build(), + 0, + 1 + ); + 
Assert.IsFalse(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Character).Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLeadingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201csample text").AddPrecedingMarker(UsfmMarkerType.Verse).Build(), + 0, + 1 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingWhitespace()); + } + + [Test] + public void HasTrailingWhitespace() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 5, + 6 + ); + Assert.IsTrue(quotationMarkStringMatch.HasTrailingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample\ttext").Build(), + 5, + 6 + ); + Assert.IsTrue(quotationMarkStringMatch.HasTrailingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.HasTrailingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Paragraph).Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.HasTrailingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Embed).Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.HasTrailingWhitespace()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").AddPrecedingMarker(UsfmMarkerType.Verse).Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.HasTrailingWhitespace()); + } + + [Test] + public void 
HasLeadingPunctuation() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample)\u201d text").Build(), + 7, + 8 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingPunctuation()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample) \u201d text").Build(), + 8, + 9 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLeadingPunctuation()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample,\u201d text").Build(), + 7, + 8 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingPunctuation()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample.\u201d text").Build(), + 7, + 8 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingPunctuation()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\u201csample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLeadingPunctuation()); + } + + [Test] + public void HasTrailingPunctuation() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample \u201c-text").Build(), + 7, + 8 + ); + Assert.IsTrue(quotationMarkStringMatch.HasTrailingPunctuation()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample \u201c text").Build(), + 7, + 8 + ); + Assert.IsFalse(quotationMarkStringMatch.HasTrailingPunctuation()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text\u201d").Build(), + 11, + 12 + ); + Assert.IsFalse(quotationMarkStringMatch.HasTrailingPunctuation()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample', text\u201d").Build(), + 6, + 7 + ); + Assert.IsTrue(quotationMarkStringMatch.HasTrailingPunctuation()); + } + + [Test] + 
public void HasLetterInLeadingSubstring() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 1, + 2 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLetterInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("ꮪample text").Build(), + 1, + 2 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLetterInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLetterInLeadingSubstring()); + } + + [Test] + public void HasLetterInTrailingSubstring() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 9, + 10 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLetterInTrailingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample tex𑢼").Build(), + 9, + 10 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLetterInTrailingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLetterInTrailingSubstring()); + } + + [Test] + public void HasLeadingLatinLetter() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 1, + 2 + ); + Assert.IsTrue(quotationMarkStringMatch.HasLeadingLatinLetter()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("5ample text").Build(), + 1, + 2 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLeadingLatinLetter()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("Sample text").Build(), + 1, + 2 + ); + 
Assert.IsTrue(quotationMarkStringMatch.HasLeadingLatinLetter()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 0, + 1 + ); + Assert.IsFalse(quotationMarkStringMatch.HasLeadingLatinLetter()); + } + + [Test] + public void HasTrailingLatinLetter() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 9, + 10 + ); + Assert.IsTrue(quotationMarkStringMatch.HasTrailingLatinLetter()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample texT").Build(), + 9, + 10 + ); + Assert.IsTrue(quotationMarkStringMatch.HasTrailingLatinLetter()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample text").Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.HasTrailingLatinLetter()); + } + + [Test] + public void HasQuoteIntroducerInLeadingSubstring() + { + var quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample, \u201ctext").Build(), + 8, + 9 + ); + Assert.IsTrue(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample,\u201ctext").Build(), + 7, + 8 + ); + Assert.IsTrue(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample: \u201ctext").Build(), + 8, + 9 + ); + Assert.IsTrue(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample:\u201ctext").Build(), + 7, + 8 + ); + Assert.IsTrue(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new 
TextSegment.Builder().SetText("sample, \u201ctext").Build(), + 9, + 10 + ); + Assert.IsTrue(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample,, \u201ctext").Build(), + 9, + 10 + ); + Assert.IsTrue(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample, a \u201ctext").Build(), + 10, + 11 + ); + Assert.IsFalse(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + + quotationMarkStringMatch = new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("sample, text").Build(), + 8, + 9 + ); + Assert.IsTrue(quotationMarkStringMatch.HasQuoteIntroducerInLeadingSubstring()); + } +} diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkTabulatorTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkTabulatorTests.cs new file mode 100644 index 00000000..c3daec46 --- /dev/null +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkTabulatorTests.cs @@ -0,0 +1,211 @@ +using NUnit.Framework; + +namespace SIL.Machine.PunctuationAnalysis; + +[TestFixture] +public class QuotationMarkTabulatorTests +{ + [Test] + public void GetObservedCount() + { + var counts = new QuotationMarkCounts(); + ; + Assert.That(counts.TotalCount, Is.EqualTo(0)); + + counts.CountQuotationMark("\""); + Assert.That(counts.TotalCount, Is.EqualTo(1)); + + counts.CountQuotationMark("\""); + Assert.That(counts.TotalCount, Is.EqualTo(2)); + + counts.CountQuotationMark("'"); + Assert.That(counts.TotalCount, Is.EqualTo(3)); + } + + [Test] + public void GetBestProportion() + { + var counts = new QuotationMarkCounts(); + counts.CountQuotationMark("\""); + counts.CountQuotationMark("\""); + counts.CountQuotationMark("'"); + + (string bestStr, int bestCount, int totalCount) = counts.FindBestQuotationMarkProportion(); + 
Assert.That(bestStr, Is.EqualTo("\"")); + Assert.That(bestCount, Is.EqualTo(2)); + Assert.That(totalCount, Is.EqualTo(3)); + + counts.CountQuotationMark("'"); + counts.CountQuotationMark("'"); + + (bestStr, bestCount, totalCount) = counts.FindBestQuotationMarkProportion(); + Assert.That(bestStr, Is.EqualTo("'")); + Assert.That(bestCount, Is.EqualTo(3)); + Assert.That(totalCount, Is.EqualTo(5)); + } + + [Test] + public void CalculateNumDifferences() + { + var counts = new QuotationMarkCounts(); + counts.CountQuotationMark("\""); + counts.CountQuotationMark("\""); + counts.CountQuotationMark("'"); + + Assert.That(counts.CalculateNumDifferences("\""), Is.EqualTo(1)); + Assert.That(counts.CalculateNumDifferences("'"), Is.EqualTo(2)); + Assert.That(counts.CalculateNumDifferences("\u201c"), Is.EqualTo(3)); + + counts.CountQuotationMark("'"); + Assert.That(counts.CalculateNumDifferences("\""), Is.EqualTo(2)); + Assert.That(counts.CalculateNumDifferences("'"), Is.EqualTo(2)); + Assert.That(counts.CalculateNumDifferences("\u201c"), Is.EqualTo(4)); + } + + [Test] + public void CalculateSimilarity() + { + var singleLevelQuotationMarkTabulator = new QuotationMarkTabulator(); + singleLevelQuotationMarkTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u201d", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 0, + 1 + ), + ] + ); + + Assert.That( + singleLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention("", [new SingleLevelQuoteConvention("\u201c", "\u201d")]) + ), + Is.EqualTo(1.0) + ); + ; + Assert.That( + singleLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention("", [new SingleLevelQuoteConvention("\u201d", "\u201c")]) + ), + Is.EqualTo(0.0) + ); + ; + Assert.That( + singleLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention("", [new 
SingleLevelQuoteConvention("\u201c", "\"")]) + ), + Is.EqualTo(0.5) + ); + ; + Assert.That( + singleLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention( + "", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb") + ] + ) + ), + Is.EqualTo(1.0) + ); + + var emptyQuotationMarkTabulator = new QuotationMarkTabulator(); + Assert.That( + emptyQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention("", [new SingleLevelQuoteConvention("\u201c", "\u201d")]) + ), + Is.EqualTo(0.0) + ); + var twoLevelQuotationMarkTabulator = new QuotationMarkTabulator(); + twoLevelQuotationMarkTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u201d", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u2018", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 2 + ), + new QuotationMarkMetadata( + "\u2019", + 2, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 0, + 2 + ), + ] + ); + Assert.That( + twoLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention("", [new SingleLevelQuoteConvention("\u201c", "\u201d")]) + ), + Is.EqualTo(0.66666666666667).Within(1e-9) + ); + Assert.That( + twoLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention( + "", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019") + ] + ) + ), + Is.EqualTo(1.0) + ); + Assert.That( + twoLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention( + "", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb") + ] + ) + ), + Is.EqualTo(0.66666666666667).Within(1e-9) + ); + Assert.That( + 
twoLevelQuotationMarkTabulator.CalculateSimilarity( + new QuoteConvention( + "", + [ + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u2018", "\u2019") + ] + ) + ), + Is.EqualTo(0.33333333333333).Within(1e-9) + ); + } +} diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionSetTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionSetTests.cs new file mode 100644 index 00000000..305282c6 --- /dev/null +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionSetTests.cs @@ -0,0 +1,1923 @@ +using System.Diagnostics.CodeAnalysis; +using System.Text.RegularExpressions; +using NUnit.Framework; + +namespace SIL.Machine.PunctuationAnalysis; + +[TestFixture] +public class QuoteConventionSetTests +{ + [Test] + public void QuoteRegexes() + { + var emptyQuoteConventionSet = new QuoteConventionSet([]); + Assert.That( + emptyQuoteConventionSet.OpeningQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"", RegexOptions.Compiled).ToString()) + ); + Assert.That( + emptyQuoteConventionSet.ClosingQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"", RegexOptions.Compiled).ToString()) + ); + Assert.That( + emptyQuoteConventionSet.AllQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"", RegexOptions.Compiled).ToString()) + ); + + var quoteConventionSetWithEmptyConventions = new QuoteConventionSet( + [new QuoteConvention("empty convention 1", []), new QuoteConvention("empty convention 2", [])] + ); + Assert.That( + quoteConventionSetWithEmptyConventions.OpeningQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"", RegexOptions.Compiled).ToString()) + ); + Assert.That( + quoteConventionSetWithEmptyConventions.ClosingQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"", RegexOptions.Compiled).ToString()) + ); + Assert.That( + quoteConventionSetWithEmptyConventions.AllQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"", RegexOptions.Compiled).ToString()) + ); + + var 
standardEnglishQuoteConventionSet = new QuoteConventionSet( + [ + new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ) + ] + ); + Assert.That( + standardEnglishQuoteConventionSet.OpeningQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[‘“]", RegexOptions.Compiled).ToString()) + ); + Assert.That( + standardEnglishQuoteConventionSet.ClosingQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[’”]", RegexOptions.Compiled).ToString()) + ); + Assert.That( + standardEnglishQuoteConventionSet.AllQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[‘’“”]", RegexOptions.Compiled).ToString()) + ); + + var westernEuropeanQuoteConventionSet = new QuoteConventionSet( + [ + new QuoteConvention( + "western_european", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ), + ] + ); + Assert.That( + westernEuropeanQuoteConventionSet.OpeningQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[‘“«]", RegexOptions.Compiled).ToString()) + ); + Assert.That( + westernEuropeanQuoteConventionSet.ClosingQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[’”»]", RegexOptions.Compiled).ToString()) + ); + Assert.That( + westernEuropeanQuoteConventionSet.AllQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[‘’“”«»]", RegexOptions.Compiled).ToString()) + ); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [ + new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ), + new QuoteConvention( + "typewriter_french", + 
[ + new SingleLevelQuoteConvention("<<", ">>"), + new SingleLevelQuoteConvention("<", ">"), + new SingleLevelQuoteConvention("<<", ">>"), + new SingleLevelQuoteConvention("<", ">"), + ] + ), + new QuoteConvention( + "standard_french", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + ] + ), + ] + ); + Assert.That( + multipleQuoteConventionSet.OpeningQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[‘‹“«<<<]", RegexOptions.Compiled).ToString()) + ); + Assert.That( + multipleQuoteConventionSet.ClosingQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[’›”»>>>]", RegexOptions.Compiled).ToString()) + ); + Assert.That( + multipleQuoteConventionSet.AllQuotationMarkRegex.ToString(), + Is.EqualTo(new Regex(@"[‘’‹›“”«»<<<>>>]", RegexOptions.Compiled).ToString()) + ); + } + + [Test] + public void QuotationMarkPairMap() + { + var emptyQuoteConventionSet = new QuoteConventionSet([]); + Assert.That(emptyQuoteConventionSet.OpeningMarksByClosingMark, Has.Count.EqualTo(0)); + Assert.That(emptyQuoteConventionSet.ClosingMarksByOpeningMark, Has.Count.EqualTo(0)); + + var quoteConventionSetWithEmptyConventions = new QuoteConventionSet( + [new QuoteConvention("empty convention 1", []), new QuoteConvention("empty convention 2", [])] + ); + Assert.That(quoteConventionSetWithEmptyConventions.OpeningMarksByClosingMark, Has.Count.EqualTo(0)); + Assert.That(quoteConventionSetWithEmptyConventions.ClosingMarksByOpeningMark, Has.Count.EqualTo(0)); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet( + [ + new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ) + ] + ); + Assert.That( + 
standardEnglishQuoteConventionSet + .OpeningMarksByClosingMark.OrderBy(kvp => kvp.Key) + .SequenceEqual( + new Dictionary> { { "’", ["‘"] }, { "”", ["“"] } }.OrderBy(kvp => kvp.Key), + new QuotationMarkPairMapEqualityComparer() + ) + ); + Assert.That( + standardEnglishQuoteConventionSet + .ClosingMarksByOpeningMark.OrderBy(kvp => kvp.Key) + .SequenceEqual( + new Dictionary> { { "‘", ["’"] }, { "“", ["”"] } }.OrderBy(kvp => kvp.Key), + new QuotationMarkPairMapEqualityComparer() + ) + ); + + var westernEuropeanQuoteConventionSet = new QuoteConventionSet( + [ + new QuoteConvention( + "western_european", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ), + ] + ); + Assert.That( + westernEuropeanQuoteConventionSet + .OpeningMarksByClosingMark.OrderBy(kvp => kvp.Key) + .SequenceEqual( + new Dictionary> + { + { "’", ["‘"] }, + { "”", ["“"] }, + { "»", ["«"] } + }.OrderBy(kvp => kvp.Key), + new QuotationMarkPairMapEqualityComparer() + ) + ); + Assert.That( + westernEuropeanQuoteConventionSet + .ClosingMarksByOpeningMark.OrderBy(kvp => kvp.Key) + .SequenceEqual( + new Dictionary> + { + { "‘", ["’"] }, + { "“", ["”"] }, + { "«", ["»"] } + }.OrderBy(kvp => kvp.Key), + new QuotationMarkPairMapEqualityComparer() + ) + ); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [ + new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ), + new QuoteConvention( + "central_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ), + new QuoteConvention( + 
"standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ), + ] + ); + Assert.That( + multipleQuoteConventionSet + .ClosingMarksByOpeningMark.OrderBy(kvp => kvp.Key) + .SequenceEqual( + new Dictionary> + { + { "‘", ["’"] }, + { "“", ["”"] }, + { "„", ["“"] }, + { "‚", ["‘"] }, + { "”", ["”"] }, + { "’", ["’"] }, + }.OrderBy(kvp => kvp.Key), + new QuotationMarkPairMapEqualityComparer() + ) + ); + Assert.That( + multipleQuoteConventionSet + .OpeningMarksByClosingMark.OrderBy(kvp => kvp.Key) + .SequenceEqual( + new Dictionary> + { + { "’", ["‘", "’"] }, + { "”", ["“", "”"] }, + { "“", ["„"] }, + { "‘", ["‚"] }, + }.OrderBy(kvp => kvp.Key), + new QuotationMarkPairMapEqualityComparer() + ) + ); + } + + [Test] + public void GetQuoteConventionByName() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var centralEuropeanQuoteConvention = new QuoteConvention( + "central_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ); + + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + var multipleQuoteConventionSet = new QuoteConventionSet( + [standardEnglishQuoteConvention, 
centralEuropeanQuoteConvention, standardSwedishQuoteConvention] + ); + + Assert.That( + multipleQuoteConventionSet.GetQuoteConventionByName("standard_english"), + Is.EqualTo(standardEnglishQuoteConvention) + ); + Assert.That( + multipleQuoteConventionSet.GetQuoteConventionByName("central_european"), + Is.EqualTo(centralEuropeanQuoteConvention) + ); + Assert.That( + multipleQuoteConventionSet.GetQuoteConventionByName("standard_swedish"), + Is.EqualTo(standardSwedishQuoteConvention) + ); + Assert.IsNull(multipleQuoteConventionSet.GetQuoteConventionByName("undefined convention")); + } + + [Test] + public void GetAllQuoteConventionNames() + { + Assert.That(new QuoteConventionSet([]).GetAllQuoteConventionNames(), Has.Count.EqualTo(0)); + Assert.That( + new QuoteConventionSet([new QuoteConvention("conv", [])]) + .GetAllQuoteConventionNames() + .SequenceEqual(["conv"]) + ); + Assert.That( + new QuoteConventionSet([new QuoteConvention("conv1", []), new QuoteConvention("conv2", [])]) + .GetAllQuoteConventionNames() + .SequenceEqual(["conv1", "conv2"]) + ); + Assert.That( + new QuoteConventionSet([new QuoteConvention("conv2", []), new QuoteConvention("conv1", [])]) + .GetAllQuoteConventionNames() + .SequenceEqual(["conv1", "conv2"]) + ); + } + + [Test] + public void GetPossibleOpeningQuotationMarks() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var centralEuropeanQuoteConvention = new QuoteConvention( + "central_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ); + + var standardSwedishQuoteConvention = new 
QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + Assert.That(standardEnglishQuoteConventionSet.GetPossibleOpeningQuotationMarks().SequenceEqual(["‘", "“"])); + + var centralEuropeanQuoteConventionSet = new QuoteConventionSet([centralEuropeanQuoteConvention]); + Assert.That(centralEuropeanQuoteConventionSet.GetPossibleOpeningQuotationMarks().SequenceEqual(["‚", "„"])); + + var standardSwedishQuoteConventionSet = new QuoteConventionSet([standardSwedishQuoteConvention]); + Assert.That(standardSwedishQuoteConventionSet.GetPossibleOpeningQuotationMarks().SequenceEqual(["’", "”"])); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [standardEnglishQuoteConvention, centralEuropeanQuoteConvention, standardSwedishQuoteConvention] + ); + Assert.That( + multipleQuoteConventionSet.GetPossibleOpeningQuotationMarks().SequenceEqual(["‘", "’", "‚", "“", "”", "„"]) + ); + } + + [Test] + public void GetPossibleClosingQuotationMarks() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + var centralEuropeanQuoteConvention = new QuoteConvention( + "central_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ); + + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new 
SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + Assert.That(standardEnglishQuoteConventionSet.GetPossibleClosingQuotationMarks().SequenceEqual(["’", "”"])); + + var centralEuropeanQuoteConventionSet = new QuoteConventionSet([centralEuropeanQuoteConvention]); + Assert.That(centralEuropeanQuoteConventionSet.GetPossibleClosingQuotationMarks().SequenceEqual(["‘", "“"])); + + var standardSwedishQuoteConventionSet = new QuoteConventionSet([standardSwedishQuoteConvention]); + Assert.That(standardSwedishQuoteConventionSet.GetPossibleClosingQuotationMarks().SequenceEqual(["’", "”"])); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [standardEnglishQuoteConvention, centralEuropeanQuoteConvention, standardSwedishQuoteConvention] + ); + Assert.That(multipleQuoteConventionSet.GetPossibleClosingQuotationMarks().SequenceEqual(["‘", "’", "“", "”"])); + } + + [Test] + public void IsOpeningQuotationMark() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var centralEuropeanQuoteConvention = new QuoteConvention( + "central_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ); + + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new 
SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + + var standardFrenchQuoteConvention = new QuoteConvention( + "standard_french", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + ] + ); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + Assert.IsTrue(standardEnglishQuoteConventionSet.IsValidOpeningQuotationMark("‘")); + Assert.IsTrue(standardEnglishQuoteConventionSet.IsValidOpeningQuotationMark("“")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidOpeningQuotationMark("”")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidOpeningQuotationMark("’")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidOpeningQuotationMark("")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidOpeningQuotationMark("‘“")); + + var centralEuropeanQuoteConventionSet = new QuoteConventionSet([centralEuropeanQuoteConvention]); + Assert.IsTrue(centralEuropeanQuoteConventionSet.IsValidOpeningQuotationMark("‚")); + Assert.IsTrue(centralEuropeanQuoteConventionSet.IsValidOpeningQuotationMark("„")); + Assert.IsFalse(centralEuropeanQuoteConventionSet.IsValidOpeningQuotationMark("‘")); + Assert.IsFalse(centralEuropeanQuoteConventionSet.IsValidOpeningQuotationMark("“")); + + var standardSwedishQuoteConventionSet = new QuoteConventionSet([standardSwedishQuoteConvention]); + Assert.IsTrue(standardSwedishQuoteConventionSet.IsValidOpeningQuotationMark("’")); + Assert.IsTrue(standardSwedishQuoteConventionSet.IsValidOpeningQuotationMark("”")); + + var standardFrenchQuoteConventionSet = new QuoteConventionSet([standardFrenchQuoteConvention]); + Assert.IsTrue(standardFrenchQuoteConventionSet.IsValidOpeningQuotationMark("«")); + 
Assert.IsTrue(standardFrenchQuoteConventionSet.IsValidOpeningQuotationMark("‹")); + Assert.IsFalse(standardFrenchQuoteConventionSet.IsValidOpeningQuotationMark("»")); + Assert.IsFalse(standardFrenchQuoteConventionSet.IsValidOpeningQuotationMark("›")); + + var multipleQuoteConventionSet = new QuoteConventionSet( + [ + standardEnglishQuoteConvention, + centralEuropeanQuoteConvention, + standardSwedishQuoteConvention, + standardFrenchQuoteConvention, + ] + ); + Assert.That( + multipleQuoteConventionSet + .GetPossibleOpeningQuotationMarks() + .SequenceEqual(["‘", "’", "‚", "‹", "“", "”", "„", "«"]) + ); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("‘")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("’")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("‚")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("“")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("”")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("„")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("«")); + Assert.IsTrue(multipleQuoteConventionSet.IsValidOpeningQuotationMark("‹")); + Assert.IsFalse(multipleQuoteConventionSet.IsValidOpeningQuotationMark("»")); + Assert.IsFalse(multipleQuoteConventionSet.IsValidOpeningQuotationMark("›")); + } + + [Test] + public void IsClosingQuotationMark() + { + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var centralEuropeanQuoteConvention = new QuoteConvention( + "central_european", + [ + new SingleLevelQuoteConvention("\u201e", "\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + new SingleLevelQuoteConvention("\u201e", 
"\u201c"), + new SingleLevelQuoteConvention("\u201a", "\u2018"), + ] + ); + + var standardSwedishQuoteConvention = new QuoteConvention( + "standard_swedish", + [ + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + new SingleLevelQuoteConvention("\u201d", "\u201d"), + new SingleLevelQuoteConvention("\u2019", "\u2019"), + ] + ); + + var standardFrenchQuoteConvention = new QuoteConvention( + "standard_french", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + ] + ); + + var standardEnglishQuoteConventionSet = new QuoteConventionSet([standardEnglishQuoteConvention]); + Assert.IsTrue(standardEnglishQuoteConventionSet.IsValidClosingQuotationMark("”")); + Assert.IsTrue(standardEnglishQuoteConventionSet.IsValidClosingQuotationMark("’")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidClosingQuotationMark("‘")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidClosingQuotationMark("“")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidClosingQuotationMark("")); + Assert.IsFalse(standardEnglishQuoteConventionSet.IsValidClosingQuotationMark("”’")); + + var centralEuropeanQuoteConventionSet = new QuoteConventionSet([centralEuropeanQuoteConvention]); + Assert.IsTrue(centralEuropeanQuoteConventionSet.IsValidClosingQuotationMark("‘")); + Assert.IsTrue(centralEuropeanQuoteConventionSet.IsValidClosingQuotationMark("“")); + Assert.IsFalse(centralEuropeanQuoteConventionSet.IsValidClosingQuotationMark("„")); + Assert.IsFalse(centralEuropeanQuoteConventionSet.IsValidClosingQuotationMark("‚")); + + var standardSwedishQuoteConventionSet = new QuoteConventionSet([standardSwedishQuoteConvention]); + Assert.IsTrue(standardSwedishQuoteConventionSet.IsValidClosingQuotationMark("’")); + 
Assert.IsTrue(standardSwedishQuoteConventionSet.IsValidClosingQuotationMark("”"));

        var standardFrenchQuoteConventionSet = new QuoteConventionSet([standardFrenchQuoteConvention]);
        Assert.IsTrue(standardFrenchQuoteConventionSet.IsValidClosingQuotationMark("»"));
        Assert.IsTrue(standardFrenchQuoteConventionSet.IsValidClosingQuotationMark("›"));
        Assert.IsFalse(standardFrenchQuoteConventionSet.IsValidClosingQuotationMark("«"));
        Assert.IsFalse(standardFrenchQuoteConventionSet.IsValidClosingQuotationMark("‹"));

        var multipleQuoteConventionSet = new QuoteConventionSet(
            [
                standardEnglishQuoteConvention,
                centralEuropeanQuoteConvention,
                standardSwedishQuoteConvention,
                standardFrenchQuoteConvention,
            ]
        );
        Assert.That(
            multipleQuoteConventionSet.GetPossibleClosingQuotationMarks().SequenceEqual(["‘", "’", "›", "“", "”", "»"])
        );
        Assert.IsTrue(multipleQuoteConventionSet.IsValidClosingQuotationMark("‘"));
        Assert.IsTrue(multipleQuoteConventionSet.IsValidClosingQuotationMark("’"));
        Assert.IsTrue(multipleQuoteConventionSet.IsValidClosingQuotationMark("“"));
        Assert.IsTrue(multipleQuoteConventionSet.IsValidClosingQuotationMark("”"));
        Assert.IsTrue(multipleQuoteConventionSet.IsValidClosingQuotationMark("»"));
        Assert.IsTrue(multipleQuoteConventionSet.IsValidClosingQuotationMark("›"));
        Assert.IsFalse(multipleQuoteConventionSet.IsValidClosingQuotationMark("«"));
        Assert.IsFalse(multipleQuoteConventionSet.IsValidClosingQuotationMark("‹"));
    }

    /// <summary>
    /// Builds a four-level convention whose levels alternate between two mark pairs:
    /// primary, secondary, primary, secondary.
    /// </summary>
    /// <param name="name">Name given to the convention.</param>
    /// <param name="primaryOpening">Opening mark of levels 1 and 3.</param>
    /// <param name="primaryClosing">Closing mark of levels 1 and 3.</param>
    /// <param name="secondaryOpening">Opening mark of levels 2 and 4.</param>
    /// <param name="secondaryClosing">Closing mark of levels 2 and 4.</param>
    /// <returns>A new convention with four freshly created levels.</returns>
    private static QuoteConvention CreateAlternatingConvention(
        string name,
        string primaryOpening,
        string primaryClosing,
        string secondaryOpening,
        string secondaryClosing
    )
    {
        return new QuoteConvention(
            name,
            [
                new SingleLevelQuoteConvention(primaryOpening, primaryClosing),
                new SingleLevelQuoteConvention(secondaryOpening, secondaryClosing),
                new SingleLevelQuoteConvention(primaryOpening, primaryClosing),
                new SingleLevelQuoteConvention(secondaryOpening, secondaryClosing),
            ]
        );
    }

    /// <summary>
    /// Checks <c>MarksAreAValidPair</c> against listed opening/closing pairs for four single-convention sets and
    /// for one set that combines all four conventions.
    /// </summary>
    [Test]
    public void AreMarksAValidPair()
    {
        // Asserts the same expected result for every listed (opening, closing) pair.
        static void AssertPairs(QuoteConventionSet target, bool expected, params (string, string)[] pairs)
        {
            foreach ((string opening, string closing) in pairs)
            {
                Assert.That(
                    target.MarksAreAValidPair(opening, closing),
                    Is.EqualTo(expected),
                    $"MarksAreAValidPair(\"{opening}\", \"{closing}\")"
                );
            }
        }

        var standardEnglish = CreateAlternatingConvention("standard_english", "\u201c", "\u201d", "\u2018", "\u2019");
        var centralEuropean = CreateAlternatingConvention("central_european", "\u201e", "\u201c", "\u201a", "\u2018");
        var standardSwedish = CreateAlternatingConvention("standard_swedish", "\u201d", "\u201d", "\u2019", "\u2019");
        var standardFrench = CreateAlternatingConvention("standard_french", "\u00ab", "\u00bb", "\u2039", "\u203a");

        var englishOnly = new QuoteConventionSet([standardEnglish]);
        AssertPairs(englishOnly, true, ("“", "”"), ("‘", "’"));
        AssertPairs(englishOnly, false, ("”", "“"), ("’", "‘"), ("‘", "”"), ("‘", ""), ("", ""));

        var centralEuropeanOnly = new QuoteConventionSet([centralEuropean]);
        AssertPairs(centralEuropeanOnly, true, ("„", "“"), ("‚", "‘"));
        AssertPairs(centralEuropeanOnly, false, ("“", "„"), ("’", "‚"), ("‚", "“"), ("‚", "’"));

        var swedishOnly = new QuoteConventionSet([standardSwedish]);
        AssertPairs(swedishOnly, true, ("”", "”"), ("’", "’"));
        AssertPairs(swedishOnly, false, ("”", "’"), ("’", "”"));

        var frenchOnly = new QuoteConventionSet([standardFrench]);
        AssertPairs(frenchOnly, true, ("«", "»"), ("‹", "›"));
        AssertPairs(frenchOnly, false, ("«", "›"), ("‹", "»"));

        var combined = new QuoteConventionSet([standardEnglish, centralEuropean, standardSwedish, standardFrench]);
        AssertPairs(
            combined,
            true,
            ("“", "”"),
            ("‘", "’"),
            ("„", "“"),
            ("‚", "‘"),
            ("”", "”"),
            ("’", "’"),
            ("«", "»"),
            ("‹", "›")
        );
        AssertPairs(combined, false, ("‹", "»"), ("‹", "”"), ("„", "”"), ("’", "‘"));
    }

    /// <summary>
    /// Checks <c>IsQuotationMarkDirectionAmbiguous</c> for listed marks against five single-convention sets and
    /// one set that combines all five conventions.
    /// </summary>
    [Test]
    public void IsQuotationMarkDirectionAmbiguous()
    {
        // Asserts the same expected result for every listed mark.
        static void AssertAmbiguity(QuoteConventionSet target, bool expected, params string[] marks)
        {
            foreach (string mark in marks)
            {
                Assert.That(
                    target.IsQuotationMarkDirectionAmbiguous(mark),
                    Is.EqualTo(expected),
                    $"IsQuotationMarkDirectionAmbiguous(\"{mark}\")"
                );
            }
        }

        var standardEnglish = CreateAlternatingConvention("standard_english", "\u201c", "\u201d", "\u2018", "\u2019");
        var typewriterEnglish = CreateAlternatingConvention("typewriter_english", "\"", "\"", "'", "'");
        var centralEuropean = CreateAlternatingConvention("central_european", "\u201e", "\u201c", "\u201a", "\u2018");
        var standardSwedish = CreateAlternatingConvention("standard_swedish", "\u201d", "\u201d", "\u2019", "\u2019");
        var easternEuropean = CreateAlternatingConvention("eastern_european", "\u201e", "\u201d", "\u201a", "\u2019");

        var englishOnly = new QuoteConventionSet([standardEnglish]);
        AssertAmbiguity(englishOnly, false, "“", "”", "‘", "’", "\"");

        var typewriterOnly = new QuoteConventionSet([typewriterEnglish]);
        AssertAmbiguity(typewriterOnly, true, "\"", "'");
        AssertAmbiguity(typewriterOnly, false, "‘", "’", "«");

        var centralEuropeanOnly = new QuoteConventionSet([centralEuropean]);
        AssertAmbiguity(centralEuropeanOnly, false, "“", "„", "‘", "‚");

        var swedishOnly = new QuoteConventionSet([standardSwedish]);
        AssertAmbiguity(swedishOnly, true, "”", "’");

        var easternEuropeanOnly = new QuoteConventionSet([easternEuropean]);
        AssertAmbiguity(easternEuropeanOnly, false, "”", "„", "’", "‚");

        var combined = new QuoteConventionSet(
            [standardEnglish, typewriterEnglish, centralEuropean, standardSwedish, easternEuropean]
        );
        AssertAmbiguity(combined, true, "\"", "'", "”", "’");
        AssertAmbiguity(combined, false, "„", "‚");

        // these are unambiguous because they are never the opening and closing in the same convention
        AssertAmbiguity(combined, false, "“", "‘");
    }

    /// <summary>
    /// Checks the marks returned by <c>GetPossiblePairedQuotationMarks</c>, in the order they are returned, for
    /// four single-convention sets and one set that combines all four conventions.
    /// </summary>
    [Test]
    public void GetPossiblePairedQuotationMarks()
    {
        // The comparison is order-sensitive: the result must enumerate exactly as listed.
        static void AssertPairedMarks(QuoteConventionSet target, string mark, params string[] expected)
        {
            Assert.That(
                target.GetPossiblePairedQuotationMarks(mark).SequenceEqual(expected),
                $"GetPossiblePairedQuotationMarks(\"{mark}\")"
            );
        }

        var standardEnglish = CreateAlternatingConvention("standard_english", "\u201c", "\u201d", "\u2018", "\u2019");
        var centralEuropean = CreateAlternatingConvention("central_european", "\u201e", "\u201c", "\u201a", "\u2018");
        var standardSwedish = CreateAlternatingConvention("standard_swedish", "\u201d", "\u201d", "\u2019", "\u2019");
        var easternEuropean = CreateAlternatingConvention("eastern_european", "\u201e", "\u201d", "\u201a", "\u2019");

        var englishOnly = new QuoteConventionSet([standardEnglish]);
        AssertPairedMarks(englishOnly, "“", "”");
        AssertPairedMarks(englishOnly, "”", "“");
        AssertPairedMarks(englishOnly, "‘", "’");
        AssertPairedMarks(englishOnly, "’", "‘");

        var centralEuropeanOnly = new QuoteConventionSet([centralEuropean]);
        AssertPairedMarks(centralEuropeanOnly, "„", "“");
        AssertPairedMarks(centralEuropeanOnly, "“", "„");
        AssertPairedMarks(centralEuropeanOnly, "‚", "‘");
        AssertPairedMarks(centralEuropeanOnly, "‘", "‚");

        var swedishOnly = new QuoteConventionSet([standardSwedish]);
        AssertPairedMarks(swedishOnly, "”", "”");
        AssertPairedMarks(swedishOnly, "’", "’");

        var easternEuropeanOnly = new QuoteConventionSet([easternEuropean]);
        AssertPairedMarks(easternEuropeanOnly, "„", "”");
        AssertPairedMarks(easternEuropeanOnly, "”", "„");
        AssertPairedMarks(easternEuropeanOnly, "‚", "’");
        AssertPairedMarks(easternEuropeanOnly, "’", "‚");

        var combined = new QuoteConventionSet([standardEnglish, centralEuropean, standardSwedish, easternEuropean]);
        AssertPairedMarks(combined, "“", "”", "„");
        AssertPairedMarks(combined, "”", "”", "“", "„");
        AssertPairedMarks(combined, "‘", "’", "‚");
        AssertPairedMarks(combined, "’", "’", "‘", "‚");
        AssertPairedMarks(combined, "„", "“", "”");
        AssertPairedMarks(combined, "‚", "‘", "’");
    }

    /// <summary>
    /// Checks the depths returned by <c>GetPossibleDepths</c> for listed marks and directions against three
    /// single-convention sets and one set that combines all three conventions.
    /// </summary>
    [Test]
    public void GetPossibleDepths()
    {
        // Depths are sorted before comparing so that no assertion depends on the enumeration order of the
        // returned collection. Calling with no expected depths asserts that the result is empty.
        static void AssertDepths(
            QuoteConventionSet target,
            string mark,
            QuotationMarkDirection direction,
            params int[] expected
        )
        {
            Assert.That(
                target.GetPossibleDepths(mark, direction).OrderBy(depth => depth).SequenceEqual(expected),
                $"GetPossibleDepths(\"{mark}\", {direction})"
            );
        }

        var opening = QuotationMarkDirection.Opening;
        var closing = QuotationMarkDirection.Closing;

        var standardEnglish = CreateAlternatingConvention("standard_english", "\u201c", "\u201d", "\u2018", "\u2019");
        var britishEnglish = CreateAlternatingConvention("british_english", "\u2018", "\u2019", "\u201c", "\u201d");
        var normalizedWesternEuropean = new QuoteConvention(
            "westernEuropeanNormalized",
            [
                new SingleLevelQuoteConvention("\"", "\""),
                new SingleLevelQuoteConvention("\"", "\""),
                new SingleLevelQuoteConvention("'", "'"),
            ]
        );

        var englishOnly = new QuoteConventionSet([standardEnglish]);
        AssertDepths(englishOnly, "\u201c", opening, 1, 3);
        AssertDepths(englishOnly, "\u201c", closing);
        AssertDepths(englishOnly, "\u201d", closing, 1, 3);
        AssertDepths(englishOnly, "\u201d", opening);
        AssertDepths(englishOnly, "\u2018", opening, 2, 4);
        AssertDepths(englishOnly, "\u2018", closing);
        AssertDepths(englishOnly, "\u2019", closing, 2, 4);
        AssertDepths(englishOnly, "\u2019", opening);
        AssertDepths(englishOnly, "\u201e", opening);
        AssertDepths(englishOnly, "\u201e", closing);
        AssertDepths(englishOnly, "\"", opening);
        AssertDepths(englishOnly, "\"", closing);

        var britishOnly = new QuoteConventionSet([britishEnglish]);
        AssertDepths(britishOnly, "\u2018", opening, 1, 3);
        AssertDepths(britishOnly, "\u2018", closing);
        AssertDepths(britishOnly, "\u2019", closing, 1, 3);
        AssertDepths(britishOnly, "\u2019", opening);
        AssertDepths(britishOnly, "\u201c", opening, 2, 4);
        AssertDepths(britishOnly, "\u201c", closing);
        AssertDepths(britishOnly, "\u201d", closing, 2, 4);
        AssertDepths(britishOnly, "\u201d", opening);
        AssertDepths(britishOnly, "\u201e", opening);
        AssertDepths(britishOnly, "\u201e", closing);
        AssertDepths(britishOnly, "'", opening);
        AssertDepths(britishOnly, "'", closing);

        var normalizedOnly = new QuoteConventionSet([normalizedWesternEuropean]);
        AssertDepths(normalizedOnly, "\"", opening, 1, 2);
        AssertDepths(normalizedOnly, "\"", closing, 1, 2);
        AssertDepths(normalizedOnly, "'", opening, 3);
        AssertDepths(normalizedOnly, "'", closing, 3);
        AssertDepths(normalizedOnly, "\u201c", opening);
        AssertDepths(normalizedOnly, "\u201c", closing);

        var combined = new QuoteConventionSet([standardEnglish, britishEnglish, normalizedWesternEuropean]);
        AssertDepths(combined, "\u201c", opening, 1, 2, 3, 4);
        AssertDepths(combined, "\u201c", closing);
        AssertDepths(combined, "\u201d", closing, 1, 2, 3, 4);
        AssertDepths(combined, "\u201d", opening);
        AssertDepths(combined, "\u2018", opening, 1, 2, 3, 4);
        AssertDepths(combined, "\u2018", closing);
        AssertDepths(combined, "\u2019", closing, 1, 2, 3, 4);
        AssertDepths(combined, "\u2019", opening);
        AssertDepths(combined, "\u201e", opening);
        AssertDepths(combined, "\u201e", closing);
        AssertDepths(combined, "\"", opening, 1, 2);
        AssertDepths(combined, "\"", closing, 1, 2);
        AssertDepths(combined, "'", opening, 3);
        AssertDepths(combined, "'", closing, 3);
    }

    /// <summary>
    /// Checks <c>MetadataMatchesQuotationMark</c> on a standard-English-only set for five marks, both directions
    /// and every depth from 1 to 4.
    /// </summary>
    [Test]
    public void DoesMetadataMatchQuotationMark()
    {
        var standardEnglish = CreateAlternatingConvention("standard_english", "\u201c", "\u201d", "\u2018", "\u2019");
        var englishOnly = new QuoteConventionSet([standardEnglish]);
        var opening = QuotationMarkDirection.Opening;
        var closing = QuotationMarkDirection.Closing;

        // For depths 1 through 4, expects a match exactly at the listed depths and no match at the others.
        void AssertMatchingDepths(string mark, QuotationMarkDirection direction, params int[] matchingDepths)
        {
            for (int depth = 1; depth <= 4; depth++)
            {
                Assert.That(
                    englishOnly.MetadataMatchesQuotationMark(mark, depth, direction),
                    Is.EqualTo(matchingDepths.Contains(depth)),
                    $"MetadataMatchesQuotationMark(\"{mark}\", {depth}, {direction})"
                );
            }
        }

        AssertMatchingDepths("\u201c", opening, 1, 3);
        AssertMatchingDepths("\u201c", closing);
        AssertMatchingDepths("\u201d", closing, 1, 3);
        AssertMatchingDepths("\u201d", opening);
        AssertMatchingDepths("\u2018", opening, 2, 4);
        AssertMatchingDepths("\u2018", closing);
        AssertMatchingDepths("\u2019", closing, 2, 4);
        AssertMatchingDepths("\u2019", opening);
        AssertMatchingDepths("\u201e", opening);
        AssertMatchingDepths("\u201e", closing);
    }

    /// <summary>
    /// Checks which convention names remain after <c>FilterToCompatibleQuoteConventions</c> is applied with
    /// various opening and closing mark lists, for a standard-English-only set and a four-convention set.
    /// </summary>
    [Test]
    public void FilterToCompatibleQuoteConventions()
    {
        var standardEnglish = CreateAlternatingConvention("standard_english", "\u201c", "\u201d", "\u2018", "\u2019");
        var standardFrench = CreateAlternatingConvention("standard_french", "\u00ab", "\u00bb", "\u2039", "\u203a");
        var westernEuropean = new QuoteConvention(
            "western_european",
            [
                new SingleLevelQuoteConvention("\u00ab", "\u00bb"),
                new SingleLevelQuoteConvention("\u201c", "\u201d"),
                new SingleLevelQuoteConvention("\u2018", "\u2019"),
            ]
        );
        var standardSwedish = CreateAlternatingConvention("standard_swedish", "\u201d", "\u201d", "\u2019", "\u2019");

        var englishOnly = new QuoteConventionSet([standardEnglish]);
        Assert.That(
            englishOnly
                .FilterToCompatibleQuoteConventions(["\u201c"], ["\u201d"])
                .GetAllQuoteConventionNames()
                .SequenceEqual(["standard_english"])
        );
        Assert.That(
            englishOnly
                .FilterToCompatibleQuoteConventions(["\u201c", "\u2018"], ["\u201d", "\u2019"])
                .GetAllQuoteConventionNames()
                .SequenceEqual(["standard_english"])
        );
        Assert.That(
            englishOnly
                .FilterToCompatibleQuoteConventions(["\u201c", "\u2018"], ["\u201d"])
                .GetAllQuoteConventionNames()
                .SequenceEqual(["standard_english"])
        );
        Assert.That(
            englishOnly
                .FilterToCompatibleQuoteConventions(["\u201c"], ["\u201d", "\u2019"])
                .GetAllQuoteConventionNames()
                .SequenceEqual(["standard_english"])
        );
        Assert.That(
            englishOnly.FilterToCompatibleQuoteConventions(["\u2018"], ["\u201d"]).GetAllQuoteConventionNames(),
            Has.Count.EqualTo(0)
        );
        Assert.That(
            englishOnly.FilterToCompatibleQuoteConventions(["\u201c"], ["\u2019"]).GetAllQuoteConventionNames(),
            Has.Count.EqualTo(0)
        );
        Assert.That(
            englishOnly.FilterToCompatibleQuoteConventions(["\u201d"], ["\u201c"]).GetAllQuoteConventionNames(),
            Has.Count.EqualTo(0)
        );
        Assert.That(
            englishOnly
                .FilterToCompatibleQuoteConventions(["\u201c", "\u201d"], ["\u201d"])
                .GetAllQuoteConventionNames(),
            Has.Count.EqualTo(0)
        );
        Assert.That(
            englishOnly
                .FilterToCompatibleQuoteConventions(["\u201c", "\u201e"], ["\u201d"])
                .GetAllQuoteConventionNames(),
            Has.Count.EqualTo(0)
        );
        Assert.That(
            englishOnly.FilterToCompatibleQuoteConventions([], []).GetAllQuoteConventionNames(),
            Has.Count.EqualTo(0)
        );

        var combined = new QuoteConventionSet([standardEnglish, standardFrench, westernEuropean, standardSwedish]);
        Assert.That(
            combined
                .FilterToCompatibleQuoteConventions(["\u201c"], ["\u201d"])
                .GetAllQuoteConventionNames()
                .SequenceEqual(["standard_english"])
        );
        Assert.That(
            combined
                .FilterToCompatibleQuoteConventions(["\u201c", "\u2018"], ["\u201d", "\u2019"])
                .GetAllQuoteConventionNames()
                .SequenceEqual(["standard_english"])
        );
        Assert.That(
            combined
                .FilterToCompatibleQuoteConventions(["\u201d"], ["\u201d"])
                .GetAllQuoteConventionNames()
                .SequenceEqual(["standard_swedish"])
        );
        Assert.That(
            combined.FilterToCompatibleQuoteConventions(["\u201c"], ["\u201c"]).GetAllQuoteConventionNames(),
            Has.Count.EqualTo(0)
        );
        Assert.That(
            combined
                .FilterToCompatibleQuoteConventions(["\u00ab"], ["\u00bb"])
                .GetAllQuoteConventionNames()
                .SequenceEqual(["standard_french", "western_european"])
        );
        Assert.That(
            combined
                .FilterToCompatibleQuoteConventions(["\u00ab", "\u2039"], ["\u00bb"])
                .GetAllQuoteConventionNames()
                .SequenceEqual(["standard_french"])
        );
        Assert.That(
            combined
                .FilterToCompatibleQuoteConventions(["\u00ab"], ["\u00bb", "\u201d"])
                .GetAllQuoteConventionNames()
                .SequenceEqual(["western_european"])
        );
        Assert.That(
            combined.FilterToCompatibleQuoteConventions([], []).GetAllQuoteConventionNames(),
            Has.Count.EqualTo(0)
        );
    }

    [Test]
    public void FindMostSimilarConvention()
    {
        var standardEnglishQuoteConvention = new QuoteConvention(
            "standard_english",
            [
                new SingleLevelQuoteConvention("\u201c",
"\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var standardFrenchQuoteConvention = new QuoteConvention( + "standard_french", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2039", "\u203a"), + ] + ); + + var westernEuropeanQuoteConvention = new QuoteConvention( + "western_european", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + + var allThreeQuoteConventionSet = new QuoteConventionSet( + [standardEnglishQuoteConvention, standardFrenchQuoteConvention, westernEuropeanQuoteConvention,] + ); + var twoFrenchQuoteConventionSet = new QuoteConventionSet( + [westernEuropeanQuoteConvention, standardFrenchQuoteConvention] + ); + + var multipleEnglishQuotesTabulator = new QuotationMarkTabulator(); + multipleEnglishQuotesTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u2018", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 5, + 6 + ), + new QuotationMarkMetadata( + "\u2019", + 2, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 13, + 14 + ), + new QuotationMarkMetadata( + "\u201d", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 14, + 15 + ), + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 28, + 29 + ), + new QuotationMarkMetadata( + "\u201d", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 42, + 43 + ), + ] + ); + Assert.That( + 
allThreeQuoteConventionSet.FindMostSimilarConvention(multipleEnglishQuotesTabulator), + Is.EqualTo((standardEnglishQuoteConvention, 1.0)) + ); + + var multipleWesternEuropeanQuotesTabulator = new QuotationMarkTabulator(); + multipleWesternEuropeanQuotesTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u00ab", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u201c", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 5, + 6 + ), + new QuotationMarkMetadata( + "\u201d", + 2, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 13, + 14 + ), + new QuotationMarkMetadata( + "\u00bb", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 14, + 15 + ), + new QuotationMarkMetadata( + "\u00ab", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 28, + 29 + ), + new QuotationMarkMetadata( + "\u00bb", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 42, + 43 + ), + ] + ); + Assert.That( + allThreeQuoteConventionSet.FindMostSimilarConvention(multipleWesternEuropeanQuotesTabulator), + Is.EqualTo((westernEuropeanQuoteConvention, 1.0)) + ); + + var multipleFrenchQuotesTabulator = new QuotationMarkTabulator(); + multipleFrenchQuotesTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u00ab", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u2039", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 5, + 6 + ), + new QuotationMarkMetadata( + "\u203a", + 2, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 13, + 14 + ), + new QuotationMarkMetadata( + "\u00bb", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 14, + 15 + ), + new QuotationMarkMetadata( + "\u00ab", + 1, + QuotationMarkDirection.Opening, + new 
TextSegment.Builder().Build(), + 28, + 29 + ), + new QuotationMarkMetadata( + "\u00bb", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 42, + 43 + ), + ] + ); + Assert.That( + allThreeQuoteConventionSet.FindMostSimilarConvention(multipleFrenchQuotesTabulator), + Is.EqualTo((standardFrenchQuoteConvention, 1.0)) + ); + Assert.That( + twoFrenchQuoteConventionSet.FindMostSimilarConvention(multipleFrenchQuotesTabulator), + Is.EqualTo((standardFrenchQuoteConvention, 1.0)) + ); + + var noisyMultipleEnglishQuotesTabulator = new QuotationMarkTabulator(); + noisyMultipleEnglishQuotesTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u201c", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 5, + 6 + ), + new QuotationMarkMetadata( + "\u2019", + 2, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 13, + 14 + ), + new QuotationMarkMetadata( + "\u201d", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 14, + 15 + ), + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 28, + 29 + ), + new QuotationMarkMetadata( + "\u201d", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 42, + 43 + ), + ] + ); + (QuoteConvention convention, double similarity) = allThreeQuoteConventionSet.FindMostSimilarConvention( + noisyMultipleEnglishQuotesTabulator + ); + Assert.That(convention, Is.EqualTo(standardEnglishQuoteConvention)); + Assert.That(similarity, Is.EqualTo(0.9).Within(1e-9)); + (convention, similarity) = twoFrenchQuoteConventionSet.FindMostSimilarConvention( + noisyMultipleEnglishQuotesTabulator + ); + Assert.That(convention, Is.EqualTo(westernEuropeanQuoteConvention)); + Assert.That(similarity, Is.EqualTo(0.1).Within(1e-9)); + + var 
noisyMultipleFrenchQuotesTabulator = new QuotationMarkTabulator(); + noisyMultipleFrenchQuotesTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u00ab", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u2039", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 5, + 6 + ), + new QuotationMarkMetadata( + "\u203a", + 2, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 13, + 14 + ), + new QuotationMarkMetadata( + "\u2039", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 5, + 6 + ), + new QuotationMarkMetadata( + "\u2019", + 2, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 13, + 14 + ), + new QuotationMarkMetadata( + "\u00bb", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 14, + 15 + ), + new QuotationMarkMetadata( + "\u00ab", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 28, + 29 + ), + new QuotationMarkMetadata( + "\u00bb", + 1, + QuotationMarkDirection.Closing, + new TextSegment.Builder().Build(), + 42, + 43 + ), + ] + ); + (convention, similarity) = allThreeQuoteConventionSet.FindMostSimilarConvention( + noisyMultipleFrenchQuotesTabulator + ); + Assert.That(convention, Is.EqualTo(standardFrenchQuoteConvention)); + Assert.That(similarity, Is.EqualTo(0.916666666666).Within(1e-9)); + + var tooDeepEnglishQuotesTabulator = new QuotationMarkTabulator(); + tooDeepEnglishQuotesTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ), + new QuotationMarkMetadata( + "\u2018", + 2, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 5, + 6 + ), + new QuotationMarkMetadata( + "\u201c", + 3, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 13, + 14 + ), + new QuotationMarkMetadata( + "\u2018", 
+ 4, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 15, + 16 + ), + new QuotationMarkMetadata( + "\u201c", + 5, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 17, + 18 + ), + ] + ); + (convention, similarity) = allThreeQuoteConventionSet.FindMostSimilarConvention(tooDeepEnglishQuotesTabulator); + Assert.That(convention, Is.EqualTo(standardEnglishQuoteConvention)); + Assert.That(similarity, Is.EqualTo(0.967741935483871).Within(1e-9)); + + // in case of ties, the earlier convention in the list should be returned + var unknownQuoteTabulator = new QuotationMarkTabulator(); + unknownQuoteTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u201a", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ) + ] + ); + Assert.That( + allThreeQuoteConventionSet.FindMostSimilarConvention(unknownQuoteTabulator), + Is.EqualTo((standardEnglishQuoteConvention, 0.0)) + ); + + var singleFrenchOpeningQuoteTabulator = new QuotationMarkTabulator(); + singleFrenchOpeningQuoteTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u00ab", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ) + ] + ); + Assert.That( + allThreeQuoteConventionSet.FindMostSimilarConvention(singleFrenchOpeningQuoteTabulator), + Is.EqualTo((standardFrenchQuoteConvention, 1.0)) + ); + Assert.That( + twoFrenchQuoteConventionSet.FindMostSimilarConvention(singleFrenchOpeningQuoteTabulator), + Is.EqualTo((westernEuropeanQuoteConvention, 1.0)) + ); + + // Default values should be returned when the QuoteConventionSet is empty + var singleEnglishOpeningQuoteTabulator = new QuotationMarkTabulator(); + singleEnglishOpeningQuoteTabulator.Tabulate( + [ + new QuotationMarkMetadata( + "\u201c", + 1, + QuotationMarkDirection.Opening, + new TextSegment.Builder().Build(), + 0, + 1 + ) + ] + ); + var emptyQuoteConventionSet = new QuoteConventionSet([]); + Assert.That( + 
emptyQuoteConventionSet.FindMostSimilarConvention(singleEnglishOpeningQuoteTabulator), + Is.EqualTo(((QuoteConvention?)null, double.MinValue)) + ); + } + + private class QuotationMarkPairMapEqualityComparer : IEqualityComparer>> + { + public bool Equals(KeyValuePair> x, KeyValuePair> y) + { + return x.Key == y.Key && x.Value.Count == y.Value.Count && !x.Value.Except(y.Value).Any(); + } + + public int GetHashCode([DisallowNull] KeyValuePair> obj) + { + return obj.GetHashCode(); + } + } +} diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionTests.cs new file mode 100644 index 00000000..0275abfa --- /dev/null +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionTests.cs @@ -0,0 +1,429 @@ +using NUnit.Framework; + +namespace SIL.Machine.PunctuationAnalysis; + +[TestFixture] +public class QuoteConventionTests +{ + [Test] + public void SingleLevelQuoteConventionNormalize() + { + var englishLevel1QuoteConvention = new SingleLevelQuoteConvention("\u201c", "\u201d"); + SingleLevelQuoteConvention normalizedEnglishLevel1QuoteConvention = englishLevel1QuoteConvention.Normalize(); + Assert.That(normalizedEnglishLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(normalizedEnglishLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var englishLevel2QuoteConvention = new SingleLevelQuoteConvention("\u2018", "\u2019"); + SingleLevelQuoteConvention normalizedEnglishLevel2QuoteConvention = englishLevel2QuoteConvention.Normalize(); + Assert.That(normalizedEnglishLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); + Assert.That(normalizedEnglishLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("'")); + + var alreadyNormalizedEnglishLevel1QuoteConvention = new SingleLevelQuoteConvention("\"", "\""); + SingleLevelQuoteConvention doublyNormalizedEnglishLevel1QuoteConvention = + 
alreadyNormalizedEnglishLevel1QuoteConvention.Normalize(); + Assert.That(doublyNormalizedEnglishLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(doublyNormalizedEnglishLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var alreadyNormalizedEnglishLevel2QuoteConvention = new SingleLevelQuoteConvention("'", "'"); + SingleLevelQuoteConvention doublyNormalizedEnglishLevel2QuoteConvention = + alreadyNormalizedEnglishLevel2QuoteConvention.Normalize(); + Assert.That(doublyNormalizedEnglishLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); + Assert.That(doublyNormalizedEnglishLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("'")); + + var frenchLevel1QuoteConvention = new SingleLevelQuoteConvention("\u00ab", "\u00bb"); + SingleLevelQuoteConvention normalizedFrenchLevel1QuoteConvention = frenchLevel1QuoteConvention.Normalize(); + Assert.That(normalizedFrenchLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(normalizedFrenchLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var frenchLevel2QuoteConvention = new SingleLevelQuoteConvention("\u2039", "\u203a"); + SingleLevelQuoteConvention normalizedFrenchLevel2QuoteConvention = frenchLevel2QuoteConvention.Normalize(); + Assert.That(normalizedFrenchLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("\u2039")); + Assert.That(normalizedFrenchLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("\u203a")); + + var typewriterFrenchLevel1QuoteConvention = new SingleLevelQuoteConvention("<<", ">>"); + SingleLevelQuoteConvention normalizedTypewriterFrenchLevel1QuoteConvention = + typewriterFrenchLevel1QuoteConvention.Normalize(); + Assert.That(normalizedTypewriterFrenchLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("<<")); + Assert.That(normalizedTypewriterFrenchLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo(">>")); + + var typewriterFrenchLevel2QuoteConvention = new SingleLevelQuoteConvention("<", ">"); + 
SingleLevelQuoteConvention normalizedTypewriterFrenchLevel2QuoteConvention = + typewriterFrenchLevel2QuoteConvention.Normalize(); + Assert.That(normalizedTypewriterFrenchLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("<")); + Assert.That(normalizedTypewriterFrenchLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo(">")); + + var centralEuropeanLevel1QuoteConvention = new SingleLevelQuoteConvention("\u201e", "\u201c"); + SingleLevelQuoteConvention normalizedCentralEuropeanLevel1QuoteConvention = + centralEuropeanLevel1QuoteConvention.Normalize(); + Assert.That(normalizedCentralEuropeanLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(normalizedCentralEuropeanLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var centralEuropeanLevel2QuoteConvention = new SingleLevelQuoteConvention("\u201a", "\u2018"); + SingleLevelQuoteConvention normalizedCentralEuropeanLevel2QuoteConvention = + centralEuropeanLevel2QuoteConvention.Normalize(); + Assert.That(normalizedCentralEuropeanLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); + Assert.That(normalizedCentralEuropeanLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("'")); + + var centralEuropeanGuillemetsQuoteConvention = new SingleLevelQuoteConvention("\u00bb", "\u00ab"); + SingleLevelQuoteConvention normalizedCentralEuropeanGuillemetsQuoteConvention = + centralEuropeanGuillemetsQuoteConvention.Normalize(); + Assert.That(normalizedCentralEuropeanGuillemetsQuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(normalizedCentralEuropeanGuillemetsQuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var swedishLevel1QuoteConvention = new SingleLevelQuoteConvention("\u201d", "\u201d"); + SingleLevelQuoteConvention normalizedSwedishLevel1QuoteConvention = swedishLevel1QuoteConvention.Normalize(); + Assert.That(normalizedSwedishLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + 
Assert.That(normalizedSwedishLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var swedishLevel2QuoteConvention = new SingleLevelQuoteConvention("\u2019", "\u2019"); + SingleLevelQuoteConvention normalizedSwedishLevel2QuoteConvention = swedishLevel2QuoteConvention.Normalize(); + Assert.That(normalizedSwedishLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); + Assert.That(normalizedSwedishLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("'")); + + var finnishLevel1QuoteConvention = new SingleLevelQuoteConvention("\u00bb", "\u00bb"); + SingleLevelQuoteConvention normalizedFinnishLevel1QuoteConvention = finnishLevel1QuoteConvention.Normalize(); + Assert.That(normalizedFinnishLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(normalizedFinnishLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var arabicLevel1QuoteConvention = new SingleLevelQuoteConvention("\u201d", "\u201c"); + SingleLevelQuoteConvention normalizedArabicLevel1QuoteConvention = arabicLevel1QuoteConvention.Normalize(); + Assert.That(normalizedArabicLevel1QuoteConvention.OpeningQuotationMark, Is.EqualTo("\"")); + Assert.That(normalizedArabicLevel1QuoteConvention.ClosingQuotationMark, Is.EqualTo("\"")); + + var arabicLevel2QuoteConvention = new SingleLevelQuoteConvention("\u2019", "\u2018"); + SingleLevelQuoteConvention normalizedArabicLevel2QuoteConvention = arabicLevel2QuoteConvention.Normalize(); + Assert.That(normalizedArabicLevel2QuoteConvention.OpeningQuotationMark, Is.EqualTo("'")); + Assert.That(normalizedArabicLevel2QuoteConvention.ClosingQuotationMark, Is.EqualTo("'")); + } + + [Test] + public void GetNumLevels() + { + var emptyQuoteConvention = new QuoteConvention("empty_quote_convention", []); + Assert.That(emptyQuoteConvention.NumLevels, Is.EqualTo(0)); + + var oneLevelQuoteConvention = new QuoteConvention( + "one_level_quote_convention", + [new SingleLevelQuoteConvention("\u201c", "\u201d")] + ); + 
Assert.That(oneLevelQuoteConvention.NumLevels, Is.EqualTo(1)); + + var twoLevelQuoteConvention = new QuoteConvention( + "two_level_quote_convention", + [new SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u2018", "\u2019"),] + ); + Assert.That(twoLevelQuoteConvention.NumLevels, Is.EqualTo(2)); + + var threeLevelQuoteConvention = new QuoteConvention( + "three_level_quote_convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201D", "\u201D"), + ] + ); + Assert.That(threeLevelQuoteConvention.NumLevels, Is.EqualTo(3)); + } + + [Test] + public void GetOpeningQuoteAtLevel() + { + var quoteConvention = new QuoteConvention( + "test_quote_convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ] + ); + Assert.That(quoteConvention.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\u201c")); + Assert.That(quoteConvention.GetOpeningQuotationMarkAtDepth(2), Is.EqualTo("\u2018")); + Assert.That(quoteConvention.GetOpeningQuotationMarkAtDepth(3), Is.EqualTo("\u00ab")); + } + + [Test] + public void GetClosingQuoteAtLevel() + { + var quoteConvention = new QuoteConvention( + "test_quote_convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ] + ); + Assert.That(quoteConvention.GetClosingQuotationMarkAtDepth(1), Is.EqualTo("\u201d")); + Assert.That(quoteConvention.GetClosingQuotationMarkAtDepth(2), Is.EqualTo("\u2019")); + Assert.That(quoteConvention.GetClosingQuotationMarkAtDepth(3), Is.EqualTo("\u00bb")); + } + + [Test] + public void GetExpectedQuotationMark() + { + var quoteConvention = new QuoteConvention( + "test_quote_convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new 
SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ] + ); + Assert.That(quoteConvention.GetExpectedQuotationMark(1, QuotationMarkDirection.Opening), Is.EqualTo("\u201c")); + Assert.That(quoteConvention.GetExpectedQuotationMark(1, QuotationMarkDirection.Closing), Is.EqualTo("\u201d")); + Assert.That(quoteConvention.GetExpectedQuotationMark(2, QuotationMarkDirection.Opening), Is.EqualTo("\u2018")); + Assert.That(quoteConvention.GetExpectedQuotationMark(2, QuotationMarkDirection.Closing), Is.EqualTo("\u2019")); + Assert.That(quoteConvention.GetExpectedQuotationMark(3, QuotationMarkDirection.Opening), Is.EqualTo("\u00ab")); + Assert.That(quoteConvention.GetExpectedQuotationMark(3, QuotationMarkDirection.Closing), Is.EqualTo("\u00bb")); + Assert.That(quoteConvention.GetExpectedQuotationMark(4, QuotationMarkDirection.Opening), Is.EqualTo("")); + Assert.That(quoteConvention.GetExpectedQuotationMark(4, QuotationMarkDirection.Closing), Is.EqualTo("")); + Assert.That(quoteConvention.GetExpectedQuotationMark(0, QuotationMarkDirection.Opening), Is.EqualTo("")); + Assert.That(quoteConvention.GetExpectedQuotationMark(0, QuotationMarkDirection.Closing), Is.EqualTo("")); + } + + [Test] + public void IncludesOpeningQuotationMark() + { + var emptyQuoteConvention = new QuoteConvention("empty quote convention", []); + Assert.IsFalse(emptyQuoteConvention.IncludesOpeningQuotationMark("\u201c")); + + var positiveQuoteConvention1 = new QuoteConvention( + "positive quote convention 1", + [new SingleLevelQuoteConvention("\u201c", "\u201d")] + ); + Assert.IsTrue(positiveQuoteConvention1.IncludesOpeningQuotationMark("\u201c")); + + var negativeQuoteConvention1 = new QuoteConvention( + "negative quote convention 1", + [new SingleLevelQuoteConvention("\u2018", "\u2019")] + ); + Assert.IsFalse(negativeQuoteConvention1.IncludesOpeningQuotationMark("\u201c")); + + var negativeQuoteConvention2 = new QuoteConvention( + "negative quote convention 
2", + [new SingleLevelQuoteConvention("\u201d", "\u201c")] + ); + Assert.IsFalse(negativeQuoteConvention2.IncludesOpeningQuotationMark("\u201c")); + + var positiveQuoteConvention2 = new QuoteConvention( + "positive quote convention 2", + [new SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u2018", "\u2019")] + ); + Assert.IsTrue(positiveQuoteConvention2.IncludesOpeningQuotationMark("\u201c")); + + var positiveQuoteConvention3 = new QuoteConvention( + "positive quote convention 3", + [new SingleLevelQuoteConvention("\u2018", "\u2019"), new SingleLevelQuoteConvention("\u201c", "\u201d")] + ); + Assert.IsTrue(positiveQuoteConvention3.IncludesOpeningQuotationMark("\u201c")); + + var negativeQuoteConvention3 = new QuoteConvention( + "negative quote convention 3", + [ + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ] + ); + Assert.IsFalse(negativeQuoteConvention3.IncludesOpeningQuotationMark("\u201c")); + } + + [Test] + public void IncludesClosingQuotationMark() + { + var emptyQuoteConvention = new QuoteConvention("empty quote convention", []); + Assert.IsFalse(emptyQuoteConvention.IncludesClosingQuotationMark("\u201d")); + + var positiveQuoteConvention1 = new QuoteConvention( + "positive quote convention 1", + [new SingleLevelQuoteConvention("\u201c", "\u201d")] + ); + Assert.IsTrue(positiveQuoteConvention1.IncludesClosingQuotationMark("\u201d")); + + var negativeQuoteConvention1 = new QuoteConvention( + "negative quote convention 1", + [new SingleLevelQuoteConvention("\u2018", "\u2019")] + ); + Assert.IsFalse(negativeQuoteConvention1.IncludesClosingQuotationMark("\u201d")); + + var negativeQuoteConvention2 = new QuoteConvention( + "negative quote convention 2", + [new SingleLevelQuoteConvention("\u201d", "\u201c")] + ); + Assert.IsFalse(negativeQuoteConvention2.IncludesClosingQuotationMark("\u201d")); + + var 
positiveQuoteConvention2 = new QuoteConvention( + "positive quote convention 2", + [new SingleLevelQuoteConvention("\u201c", "\u201d"), new SingleLevelQuoteConvention("\u2018", "\u2019")] + ); + Assert.IsTrue(positiveQuoteConvention2.IncludesClosingQuotationMark("\u201d")); + + var positiveQuoteConvention3 = new QuoteConvention( + "positive quote convention 3", + [new SingleLevelQuoteConvention("\u2018", "\u2019"), new SingleLevelQuoteConvention("\u201c", "\u201d")] + ); + Assert.IsTrue(positiveQuoteConvention3.IncludesClosingQuotationMark("\u201d")); + + var negativeQuoteConvention3 = new QuoteConvention( + "negative quote convention 3", + [ + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ] + ); + Assert.IsFalse(negativeQuoteConvention3.IncludesClosingQuotationMark("\u201d")); + } + + [Test] + public void GetPossibleDepths() + { + var quoteConvention = new QuoteConvention( + "test_quote_convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + Assert.That(quoteConvention.GetPossibleDepths("\u201c", QuotationMarkDirection.Opening).SequenceEqual([1, 3])); + Assert.That(quoteConvention.GetPossibleDepths("\u201c", QuotationMarkDirection.Closing), Has.Count.EqualTo(0)); + Assert.That(quoteConvention.GetPossibleDepths("\u2018", QuotationMarkDirection.Opening).SequenceEqual([2, 4])); + Assert.That(quoteConvention.GetPossibleDepths("\u2018", QuotationMarkDirection.Closing), Has.Count.EqualTo(0)); + Assert.That(quoteConvention.GetPossibleDepths("\u201d", QuotationMarkDirection.Opening), Has.Count.EqualTo(0)); + Assert.That(quoteConvention.GetPossibleDepths("\u201d", QuotationMarkDirection.Closing).SequenceEqual([1, 3])); + Assert.That(quoteConvention.GetPossibleDepths("\u2019", 
QuotationMarkDirection.Opening), Has.Count.EqualTo(0)); + Assert.That(quoteConvention.GetPossibleDepths("\u2019", QuotationMarkDirection.Closing).SequenceEqual([2, 4])); + Assert.That(quoteConvention.GetPossibleDepths("\u00ab", QuotationMarkDirection.Opening), Has.Count.EqualTo(0)); + Assert.That(quoteConvention.GetPossibleDepths("\u00ab", QuotationMarkDirection.Closing), Has.Count.EqualTo(0)); + } + + [Test] + public void IsCompatibleWithObservedQuotationMarks() + { + var quoteConvention = new QuoteConvention( + "test_quote_convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ] + ); + Assert.IsTrue( + quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c", "\u2018"], ["\u201d", "\u2019"]) + ); + Assert.IsTrue( + quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c", "\u00ab"], ["\u201d", "\u00bb"]) + ); + Assert.IsTrue(quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c"], ["\u201d", "\u2019"])); + Assert.IsTrue(quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c"], ["\u201d"])); + Assert.IsTrue( + quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c", "\u00ab"], ["\u201d", "\u2019"]) + ); + + Assert.IsFalse(quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201d", "\u2019"], ["\u201c"])); + + Assert.IsFalse(quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c", "\u201e"], ["\u201d"])); + + Assert.IsFalse( + quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c", "\u2018"], ["\u201d", "\u201f"]) + ); + + // must have observed the first_level quotes + Assert.IsFalse(quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u2018"], ["\u201d"])); + Assert.IsFalse(quoteConvention.IsCompatibleWithObservedQuotationMarks(["\u201c", "\u2018"], ["\u00ab"])); + } + + [Test] + public void Normalize() + { + var emptyQuoteConvention = new 
QuoteConvention("empty_quote_convention", []); + QuoteConvention normalizedEmptyQuoteConvention = emptyQuoteConvention.Normalize(); + Assert.That(normalizedEmptyQuoteConvention.Name, Is.EqualTo("empty_quote_convention_normalized")); + Assert.That(normalizedEmptyQuoteConvention.NumLevels, Is.EqualTo(0)); + + var standardEnglishQuoteConvention = new QuoteConvention( + "standard_english_quote_convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + QuoteConvention normalizedStandardEnglishQuoteConvention = standardEnglishQuoteConvention.Normalize(); + Assert.That( + normalizedStandardEnglishQuoteConvention.Name, + Is.EqualTo("standard_english_quote_convention_normalized") + ); + Assert.That(normalizedStandardEnglishQuoteConvention.NumLevels, Is.EqualTo(4)); + Assert.That(normalizedStandardEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\"")); + Assert.That(normalizedStandardEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(1), Is.EqualTo("\"")); + Assert.That(normalizedStandardEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(2), Is.EqualTo("'")); + Assert.That(normalizedStandardEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(2), Is.EqualTo("'")); + Assert.That(normalizedStandardEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(3), Is.EqualTo("\"")); + Assert.That(normalizedStandardEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(3), Is.EqualTo("\"")); + Assert.That(normalizedStandardEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(4), Is.EqualTo("'")); + Assert.That(normalizedStandardEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(4), Is.EqualTo("'")); + + var westernEuropeanQuoteConvention = new QuoteConvention( + "test_quote_convention", + [ + new SingleLevelQuoteConvention("\u201c", "\u201d"), + new 
SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u2018", "\u2019"), + ] + ); + QuoteConvention normalizedWesternEuropeanQuoteConvention = westernEuropeanQuoteConvention.Normalize(); + Assert.That(normalizedWesternEuropeanQuoteConvention.Name, Is.EqualTo("test_quote_convention_normalized")); + Assert.That(normalizedWesternEuropeanQuoteConvention.NumLevels, Is.EqualTo(3)); + Assert.That(normalizedWesternEuropeanQuoteConvention.GetOpeningQuotationMarkAtDepth(1), Is.EqualTo("\"")); + Assert.That(normalizedWesternEuropeanQuoteConvention.GetClosingQuotationMarkAtDepth(1), Is.EqualTo("\"")); + Assert.That(normalizedWesternEuropeanQuoteConvention.GetOpeningQuotationMarkAtDepth(2), Is.EqualTo("\"")); + Assert.That(normalizedWesternEuropeanQuoteConvention.GetClosingQuotationMarkAtDepth(2), Is.EqualTo("\"")); + Assert.That(normalizedWesternEuropeanQuoteConvention.GetOpeningQuotationMarkAtDepth(3), Is.EqualTo("'")); + Assert.That(normalizedWesternEuropeanQuoteConvention.GetClosingQuotationMarkAtDepth(3), Is.EqualTo("'")); + + var hybridBritishTypewriterEnglishQuoteConvention = new QuoteConvention( + "hybrid_british_typewriter_english_quote_convention", + [ + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("'", "'"), + new SingleLevelQuoteConvention("\"", "\""), + ] + ); + + QuoteConvention normalizedHybridBritishTypewriterEnglishQuoteConvention = ( + hybridBritishTypewriterEnglishQuoteConvention.Normalize() + ); + Assert.IsTrue( + normalizedHybridBritishTypewriterEnglishQuoteConvention.Name + == "hybrid_british_typewriter_english_quote_convention_normalized" + ); + Assert.That(normalizedHybridBritishTypewriterEnglishQuoteConvention.NumLevels, Is.EqualTo(3)); + Assert.That( + normalizedHybridBritishTypewriterEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(1), + Is.EqualTo("\"") + ); + Assert.That( + normalizedHybridBritishTypewriterEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(1), + 
Is.EqualTo("\"")
+        );
+        Assert.That(
+            normalizedHybridBritishTypewriterEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(2),
+            Is.EqualTo("'")
+        );
+        Assert.That(
+            normalizedHybridBritishTypewriterEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(2),
+            Is.EqualTo("'")
+        );
+        Assert.That(
+            normalizedHybridBritishTypewriterEnglishQuoteConvention.GetOpeningQuotationMarkAtDepth(3),
+            Is.EqualTo("\"")
+        );
+        Assert.That(
+            normalizedHybridBritishTypewriterEnglishQuoteConvention.GetClosingQuotationMarkAtDepth(3),
+            Is.EqualTo("\"")
+        );
+    }
+}
diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs
new file mode 100644
index 00000000..2870d81e
--- /dev/null
+++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs
@@ -0,0 +1,312 @@
+using NUnit.Framework;
+using SIL.Machine.Corpora;
+
+namespace SIL.Machine.PunctuationAnalysis;
+
+// Unit tests for TextSegment and TextSegment.Builder: builder defaults and setters, equality,
+// substring helpers, preceding-marker queries, verse-position helpers and in-place text replacement.
+[TestFixture]
+public class TextSegmentTests
+{
+    // A segment built from an untouched builder has empty text, no neighbours, NoMarker as its
+    // immediate marker, an empty marker context, zeroed verse position and no USFM token.
+    [Test]
+    public void BuilderInitialization()
+    {
+        var builder = new TextSegment.Builder();
+        TextSegment textSegment = builder.Build();
+
+        Assert.That(textSegment.Text, Is.EqualTo(""));
+        Assert.IsNull(textSegment.PreviousSegment);
+        Assert.IsNull(textSegment.NextSegment);
+        Assert.IsTrue(textSegment.ImmediatePrecedingMarker is UsfmMarkerType.NoMarker);
+        Assert.That(textSegment.MarkersInPrecedingContext, Has.Count.EqualTo(0));
+        Assert.That(textSegment.IndexInVerse, Is.EqualTo(0));
+        Assert.That(textSegment.NumSegmentsInVerse, Is.EqualTo(0));
+        Assert.IsNull(textSegment.UsfmToken);
+    }
+
+    // SetText is reflected in the Text of the built segment.
+    [Test]
+    public void BuilderSetText()
+    {
+        var builder = new TextSegment.Builder();
+        string text = "Example text";
+        builder.SetText(text);
+
+        Assert.That(builder.Build().Text, Is.EqualTo(text));
+    }
+
+    // SetPreviousSegment sets only PreviousSegment; every other property keeps its default.
+    [Test]
+    public void BuilderSetPreviousSegment()
+    {
+        var builder = new TextSegment.Builder();
+        TextSegment previousSegment = new TextSegment.Builder().SetText("previous segment text").Build();
+        builder.SetPreviousSegment(previousSegment);
+        TextSegment textSegment = builder.Build();
+
+        Assert.That(textSegment.PreviousSegment, Is.EqualTo(previousSegment));
+        Assert.IsNull(textSegment.NextSegment);
+        Assert.IsTrue(textSegment.ImmediatePrecedingMarker is UsfmMarkerType.NoMarker);
+        Assert.That(textSegment.MarkersInPrecedingContext, Has.Count.EqualTo(0));
+        Assert.That(textSegment.IndexInVerse, Is.EqualTo(0));
+        Assert.That(textSegment.NumSegmentsInVerse, Is.EqualTo(0));
+    }
+
+    // Each AddPrecedingMarker call becomes the immediate preceding marker and is appended to the
+    // marker context, so the context accumulates across calls on the same builder.
+    [Test]
+    public void BuilderAddPrecedingMarker()
+    {
+        var builder = new TextSegment.Builder();
+        builder.AddPrecedingMarker(UsfmMarkerType.Chapter);
+        TextSegment textSegment = builder.Build();
+
+        Assert.IsTrue(textSegment.ImmediatePrecedingMarker is UsfmMarkerType.Chapter);
+        Assert.That(textSegment.MarkersInPrecedingContext.SequenceEqual([UsfmMarkerType.Chapter]));
+        Assert.IsNull(textSegment.PreviousSegment);
+        Assert.IsNull(textSegment.NextSegment);
+
+        // The same builder is reused: the second marker is added on top of the first.
+        builder.AddPrecedingMarker(UsfmMarkerType.Verse);
+        textSegment = builder.Build();
+
+        Assert.That(textSegment.ImmediatePrecedingMarker, Is.EqualTo(UsfmMarkerType.Verse));
+        Assert.That(
+            textSegment.MarkersInPrecedingContext.SequenceEqual([UsfmMarkerType.Chapter, UsfmMarkerType.Verse,])
+        );
+        Assert.IsNull(textSegment.PreviousSegment);
+        Assert.IsNull(textSegment.NextSegment);
+    }
+
+    // SetUsfmToken attaches the token but leaves the segment's own Text empty.
+    [Test]
+    public void BuilderSetUsfmToken()
+    {
+        var builder = new TextSegment.Builder();
+        builder.SetUsfmToken(new UsfmToken("USFM token text"));
+        TextSegment textSegment = builder.Build();
+
+        Assert.IsNotNull(textSegment.UsfmToken);
+        Assert.That(textSegment.UsfmToken.Type, Is.EqualTo(UsfmTokenType.Text));
+        Assert.That(textSegment.UsfmToken.Text, Is.EqualTo("USFM token text"));
+        Assert.That(textSegment.Text, Is.EqualTo(""));
+        Assert.IsNull(textSegment.PreviousSegment);
+        Assert.IsNull(textSegment.NextSegment);
+    }
+
+    // Pins which properties take part in TextSegment equality, one property at a time.
+    [Test]
+    public void Equals()
+    {
+        // Text: same text compares equal, different text (or a different type) does not.
+        TextSegment basicSegment = new TextSegment.Builder().SetText("text1").Build();
+        TextSegment sameTextSegment = new TextSegment.Builder().SetText("text1").Build();
+        TextSegment differentTextSegment = new TextSegment.Builder().SetText("different text").Build();
+#pragma warning disable NUnit2009 // The same value has been provided as both the actual and the expected argument
+        Assert.That(basicSegment, Is.EqualTo(basicSegment));
+#pragma warning restore NUnit2009 // The same value has been provided as both the actual and the expected argument
+#pragma warning disable NUnit2021 // Incompatible types for EqualTo constraint
+        Assert.That(basicSegment, Is.Not.EqualTo(new UsfmToken("text1")));
+#pragma warning restore NUnit2021 // Incompatible types for EqualTo constraint
+        Assert.That(basicSegment, Is.EqualTo(sameTextSegment));
+        Assert.That(basicSegment, Is.Not.EqualTo(differentTextSegment));
+
+        // IndexInVerse: expected to affect equality.
+        TextSegment segmentWithIndex = new TextSegment.Builder().SetText("text1").Build();
+        segmentWithIndex.IndexInVerse = 1;
+        TextSegment segmentWithSameIndex = new TextSegment.Builder().SetText("text1").Build();
+        segmentWithSameIndex.IndexInVerse = 1;
+        TextSegment segmentWithDifferentIndex = new TextSegment.Builder().SetText("text1").Build();
+        segmentWithDifferentIndex.IndexInVerse = 2;
+
+        Assert.That(segmentWithIndex, Is.EqualTo(segmentWithSameIndex));
+        Assert.That(segmentWithIndex, Is.Not.EqualTo(segmentWithDifferentIndex));
+        Assert.That(segmentWithIndex, Is.Not.EqualTo(basicSegment));
+
+        // Fixtures for the preceding-marker checks, which are asserted further down.
+        TextSegment segmentWithPrecedingMarker = (
+            new TextSegment.Builder().SetText("text1").AddPrecedingMarker(UsfmMarkerType.Verse).Build()
+        );
+        TextSegment segmentWithSamePrecedingMarker = (
+            new TextSegment.Builder().SetText("text1").AddPrecedingMarker(UsfmMarkerType.Verse).Build()
+        );
+        TextSegment segmentWithDifferentPrecedingMarker = (
+            new TextSegment.Builder().SetText("text1").AddPrecedingMarker(UsfmMarkerType.Chapter).Build()
+        );
+        TextSegment segmentWithMultiplePrecedingMarkers = (
+            new TextSegment.Builder()
+                .SetText("text1")
+                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                .Build()
+        );
+
+        // UsfmToken: the same token instance compares equal.
+        var usfmToken = new UsfmToken("USFM token text");
+        TextSegment segmentWithUsfmToken = new TextSegment.Builder().SetText("text1").SetUsfmToken(usfmToken).Build();
+        TextSegment segmentWithSameUsfmToken = new TextSegment.Builder()
+            .SetText("text1")
+            .SetUsfmToken(usfmToken)
+            .Build();
+        TextSegment segmentWithDifferentUsfmToken = (
+            new TextSegment.Builder().SetText("text1").SetUsfmToken(new UsfmToken("Different USFM token text")).Build()
+        );
+
+        Assert.That(segmentWithUsfmToken, Is.EqualTo(segmentWithSameUsfmToken));
+        // NOTE(review): the next two checks use the != operator rather than Is.Not.EqualTo. Unless
+        // TextSegment overloads !=, they compare references and always pass for distinct instances -
+        // confirm against TextSegment.
+        Assert.IsTrue(segmentWithUsfmToken != segmentWithDifferentUsfmToken);
+        Assert.IsTrue(basicSegment != segmentWithUsfmToken);
+
+        // NumSegmentsInVerse: the assertions below expect a differing value to make segments unequal.
+        // NOTE(review): this comment originally read "attributes that are not used in equality checks",
+        // which contradicts these assertions; that description fits the PreviousSegment/NextSegment
+        // checks at the end of this test.
+        TextSegment segmentWithNumVerses = new TextSegment.Builder().SetText("text1").Build();
+        segmentWithNumVerses.NumSegmentsInVerse = 3;
+        TextSegment segmentWithSameNumVerses = new TextSegment.Builder().SetText("text1").Build();
+        segmentWithSameNumVerses.NumSegmentsInVerse = 3;
+        TextSegment segmentWithDifferentNumVerses = new TextSegment.Builder().SetText("text1").Build();
+        segmentWithDifferentNumVerses.NumSegmentsInVerse = 4;
+
+        Assert.That(segmentWithNumVerses, Is.EqualTo(segmentWithSameNumVerses));
+        Assert.That(segmentWithNumVerses, Is.Not.EqualTo(segmentWithDifferentNumVerses));
+        Assert.That(segmentWithNumVerses, Is.Not.EqualTo(basicSegment));
+
+        // Preceding markers: a different immediate marker breaks equality, while an extra earlier
+        // marker (Chapter before Verse) does not - both segments still end on Verse.
+        Assert.That(segmentWithPrecedingMarker, Is.EqualTo(segmentWithSamePrecedingMarker));
+        Assert.That(segmentWithPrecedingMarker, Is.Not.EqualTo(segmentWithDifferentPrecedingMarker));
+        Assert.That(segmentWithPrecedingMarker, Is.EqualTo(segmentWithMultiplePrecedingMarkers));
+        Assert.That(segmentWithPrecedingMarker, Is.Not.EqualTo(basicSegment));
+
+        // PreviousSegment / NextSegment links are expected not to take part in equality.
+        TextSegment segmentWithPreviousSegment = new TextSegment.Builder().SetText("text1").Build();
+        segmentWithPreviousSegment.PreviousSegment = segmentWithNumVerses;
+        TextSegment segmentWithNextSegment = new TextSegment.Builder().SetText("text1").Build();
+        segmentWithNextSegment.NextSegment = segmentWithNumVerses;
+
+        Assert.That(basicSegment, Is.EqualTo(segmentWithPreviousSegment));
+        Assert.That(basicSegment, Is.EqualTo(segmentWithNextSegment));
+    }
+
+    // Text returns exactly what was given to SetText.
+    [Test]
+    public void GetText()
+    {
+        TextSegment textSegment = new TextSegment.Builder().SetText("example text").Build();
+        Assert.That(textSegment.Text, Is.EqualTo("example text"));
+
+        textSegment = new TextSegment.Builder().SetText("new example text").Build();
+        Assert.That(textSegment.Text, Is.EqualTo("new example text"));
+    }
+
+    // Length matches the length of the segment's text.
+    [Test]
+    public void Length()
+    {
+        TextSegment textSegment = new TextSegment.Builder().SetText("example text").Build();
+        Assert.That(textSegment.Length, Is.EqualTo("example text".Length));
+
+        textSegment = new TextSegment.Builder().SetText("new example text").Build();
+        Assert.That(textSegment.Length, Is.EqualTo("new example text".Length));
+    }
+
+    // SubstringBefore(i) returns the text up to, but not including, index i; 0 and Length are valid.
+    [Test]
+    public void SubstringBefore()
+    {
+        TextSegment textSegment = new TextSegment.Builder().SetText("example text").Build();
+        Assert.That(textSegment.SubstringBefore(7), Is.EqualTo("example"));
+        Assert.That(textSegment.SubstringBefore(8), Is.EqualTo("example "));
+        Assert.That(textSegment.SubstringBefore(0), Is.EqualTo(""));
+        Assert.That(textSegment.SubstringBefore(12), Is.EqualTo("example text"));
+    }
+
+    // SubstringAfter(i) returns the text from index i (inclusive) to the end; 0 and Length are valid.
+    [Test]
+    public void SubstringAfter()
+    {
+        TextSegment textSegment = new TextSegment.Builder().SetText("example text").Build();
+        Assert.That(textSegment.SubstringAfter(7), Is.EqualTo(" text"));
+        Assert.That(textSegment.SubstringAfter(8), Is.EqualTo("text"));
+        Assert.That(textSegment.SubstringAfter(0), Is.EqualTo("example text"));
+        Assert.That(textSegment.SubstringAfter(12), Is.EqualTo(""));
+        Assert.That(textSegment.SubstringAfter(11), Is.EqualTo("t"));
+    }
+
+    // MarkerIsInPrecedingContext reports every marker added to the builder, not only the last one.
+    [Test]
+    public void IsMarkerInPrecedingContext()
+    {
+        TextSegment noPrecedingMarkerSegment = new TextSegment.Builder().SetText("example text").Build();
+        Assert.IsFalse(noPrecedingMarkerSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Chapter));
+        Assert.IsFalse(noPrecedingMarkerSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse));
+        Assert.IsFalse(noPrecedingMarkerSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Character));
+
+        TextSegment onePrecedingMarkerTextSegment = (
+            new TextSegment.Builder().SetText("example text").AddPrecedingMarker(UsfmMarkerType.Character).Build()
+        );
+
+        Assert.IsTrue(onePrecedingMarkerTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Character));
+        Assert.IsFalse(onePrecedingMarkerTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse));
+        Assert.IsFalse(onePrecedingMarkerTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Chapter));
+
+        TextSegment twoPrecedingMarkersTextSegment = (
+            new TextSegment.Builder()
+                .SetText("example text")
+                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                .Build()
+        );
+        Assert.IsTrue(twoPrecedingMarkersTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Chapter));
+        Assert.IsTrue(twoPrecedingMarkersTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse));
+        Assert.IsFalse(twoPrecedingMarkersTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Character));
+
+        TextSegment threePrecedingMarkersTextSegment = (
+            new TextSegment.Builder()
+                .SetText("example text")
+                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                .AddPrecedingMarker(UsfmMarkerType.Character)
+                .Build()
+        );
+        Assert.IsTrue(threePrecedingMarkersTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Chapter));
+        Assert.IsTrue(threePrecedingMarkersTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse));
+        Assert.IsTrue(threePrecedingMarkersTextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Character));
+    }
+
+    // IsFirstSegmentInVerse is true for IndexInVerse 0 and false for 1.
+    [Test]
+    public void IsFirstSegmentInVerse()
+    {
+        TextSegment textSegment = new TextSegment.Builder().SetText("example text").Build();
+        textSegment.IndexInVerse = 0;
+        Assert.IsTrue(textSegment.IsFirstSegmentInVerse());
+
+        textSegment.IndexInVerse = 1;
+        Assert.IsFalse(textSegment.IsFirstSegmentInVerse());
+    }
+
+    // IsLastSegmentInVerse is true when IndexInVerse is NumSegmentsInVerse - 1.
+    [Test]
+    public void IsLastSegmentInVerse()
+    {
+        TextSegment textSegment = new TextSegment.Builder().SetText("example text").Build();
+        textSegment.IndexInVerse = 0;
+        textSegment.NumSegmentsInVerse = 1;
+        Assert.IsTrue(textSegment.IsLastSegmentInVerse());
+
+        textSegment.IndexInVerse = 0;
+        textSegment.NumSegmentsInVerse = 2;
+        Assert.IsFalse(textSegment.IsLastSegmentInVerse());
+
+        // NumSegmentsInVerse is still 2 here, so index 1 is the last segment.
+        textSegment.IndexInVerse = 1;
+        Assert.IsTrue(textSegment.IsLastSegmentInVerse());
+    }
+
+    // ReplaceSubstring mutates the segment in place. Judging by the expected results, the arguments
+    // are (start index, exclusive end index, replacement); each call operates on the text left by
+    // the previous one.
+    [Test]
+    public void ReplaceSubstring()
+    {
+        TextSegment textSegment = new TextSegment.Builder().SetText("example text").Build();
+        textSegment.ReplaceSubstring(0, 7, "sample");
+        Assert.That(textSegment.Text, Is.EqualTo("sample text"));
+
+        // Replacing a span with identical text leaves the segment unchanged.
+        textSegment.ReplaceSubstring(7, 11, "text");
+        Assert.That(textSegment.Text, Is.EqualTo("sample text"));
+
+        // An empty replacement deletes the span.
+        textSegment.ReplaceSubstring(0, 7, "");
+        Assert.That(textSegment.Text, Is.EqualTo("text"));
+
+        textSegment.ReplaceSubstring(0, 4, "new'");
+        Assert.That(textSegment.Text, Is.EqualTo("new'"));
+
+        // Swaps the trailing straight apostrophe for U+2019.
+        textSegment.ReplaceSubstring(3, 4, "\u2019");
+        Assert.That(textSegment.Text, Is.EqualTo("new\u2019"));
+
+        // A zero-width span (start == end) inserts at that position.
+        textSegment.ReplaceSubstring(0, 0, "prefix ");
+        Assert.That(textSegment.Text, Is.EqualTo("prefix new\u2019"));
+
+        textSegment.ReplaceSubstring(0, 0, "");
+        Assert.That(textSegment.Text, Is.EqualTo("prefix new\u2019"));
+
+        // Inserting at index == Length appends.
+        textSegment.ReplaceSubstring(11, 11, " suffix");
+        Assert.That(textSegment.Text, Is.EqualTo("prefix new\u2019 suffix"));
+
+        textSegment.ReplaceSubstring(6, 6, "-");
+        Assert.That(textSegment.Text, Is.EqualTo("prefix- new\u2019 suffix"));
+    }
+}
diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs
b/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs
new file mode 100644
index 00000000..6615ec92
--- /dev/null
+++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs
@@ -0,0 +1,497 @@
+using NUnit.Framework;
+using SIL.Machine.Corpora;
+using SIL.Scripture;
+
+namespace SIL.Machine.PunctuationAnalysis;
+
+// Unit tests for UsfmStructureExtractor: each test feeds parser-handler callbacks directly to the
+// extractor and checks the chapter/verse/text-segment structure returned by GetChapters().
+[TestFixture]
+public class UsfmStructureExtractorTests
+{
+    private MockUsfmParserState _verseTextParserState;
+
+    // Builds a parser state with no tokens, positioned on verse 1, shared by every callback below.
+    [SetUp]
+    public void SetUp()
+    {
+        _verseTextParserState = new MockUsfmParserState(new UsfmStylesheet("usfm.sty"), ScrVers.English, []);
+        _verseTextParserState.SetVerseNum(1);
+    }
+
+    // Chapter + Verse + Text yields one segment whose marker context is [Chapter, Verse].
+    [Test]
+    public void ChapterAndVerseMarkers()
+    {
+        var usfmStructureExtractor = new UsfmStructureExtractor();
+        usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null);
+        usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
+        usfmStructureExtractor.Text(_verseTextParserState, "test");
+
+        List<Chapter> expectedChapters =
+        [
+            new Chapter(
+                [
+                    new Verse(
+                        [
+                            new TextSegment.Builder()
+                                .SetText("test")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .Build()
+                        ]
+                    )
+                ]
+            )
+        ];
+
+        List<Chapter> actualChapters = usfmStructureExtractor.GetChapters();
+        AssertChapterEqual(expectedChapters, actualChapters);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment);
+    }
+
+    // StartPara before the text adds a Paragraph marker to the segment's context.
+    [Test]
+    public void StartParagraphMarker()
+    {
+        var usfmStructureExtractor = new UsfmStructureExtractor();
+        usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null);
+        usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
+        usfmStructureExtractor.StartPara(_verseTextParserState, "p", false, null);
+        usfmStructureExtractor.Text(_verseTextParserState, "test");
+
+        List<Chapter> expectedChapters =
+        [
+            new Chapter(
+                [
+                    new Verse(
+                        [
+                            new TextSegment.Builder()
+                                .SetText("test")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .AddPrecedingMarker(UsfmMarkerType.Paragraph)
+                                .Build()
+                        ]
+                    )
+                ]
+            )
+        ];
+
+        List<Chapter> actualChapters = usfmStructureExtractor.GetChapters();
+        AssertChapterEqual(expectedChapters, actualChapters);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment);
+    }
+
+    // StartChar before the text adds a Character marker to the segment's context.
+    [Test]
+    public void StartCharacterMarker()
+    {
+        var usfmStructureExtractor = new UsfmStructureExtractor();
+        usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null);
+        usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
+        usfmStructureExtractor.StartChar(_verseTextParserState, "k", false, null);
+        usfmStructureExtractor.Text(_verseTextParserState, "test");
+
+        List<Chapter> expectedChapters =
+        [
+            new Chapter(
+                [
+                    new Verse(
+                        [
+                            new TextSegment.Builder()
+                                .SetText("test")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .AddPrecedingMarker(UsfmMarkerType.Character)
+                                .Build()
+                        ]
+                    )
+                ]
+            )
+        ];
+
+        List<Chapter> actualChapters = usfmStructureExtractor.GetChapters();
+        AssertChapterEqual(expectedChapters, actualChapters);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment);
+    }
+
+    // EndChar is also recorded as a Character marker.
+    [Test]
+    public void EndCharacterMarker()
+    {
+        var usfmStructureExtractor = new UsfmStructureExtractor();
+        usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null);
+        usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
+        usfmStructureExtractor.EndChar(_verseTextParserState, "k", null, false);
+        usfmStructureExtractor.Text(_verseTextParserState, "test");
+
+        List<Chapter> expectedChapters =
+        [
+            new Chapter(
+                [
+                    new Verse(
+                        [
+                            new TextSegment.Builder()
+                                .SetText("test")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .AddPrecedingMarker(UsfmMarkerType.Character)
+                                .Build()
+                        ]
+                    )
+                ]
+            )
+        ];
+
+        List<Chapter> actualChapters = usfmStructureExtractor.GetChapters();
+        AssertChapterEqual(expectedChapters, actualChapters);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment);
+    }
+
+    // EndNote is recorded as an Embed marker.
+    [Test]
+    public void EndNoteMarker()
+    {
+        var usfmStructureExtractor = new UsfmStructureExtractor();
+        usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null);
+        usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
+        usfmStructureExtractor.EndNote(_verseTextParserState, "f", false);
+        usfmStructureExtractor.Text(_verseTextParserState, "test");
+
+        List<Chapter> expectedChapters =
+        [
+            new Chapter(
+                [
+                    new Verse(
+                        [
+                            new TextSegment.Builder()
+                                .SetText("test")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .AddPrecedingMarker(UsfmMarkerType.Embed)
+                                .Build()
+                        ]
+                    )
+                ]
+            )
+        ];
+
+        List<Chapter> actualChapters = usfmStructureExtractor.GetChapters();
+        AssertChapterEqual(expectedChapters, actualChapters);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment);
+    }
+
+    // The end of a table is expected to be recorded as an Embed marker.
+    // This test previously called EndNote with marker "tr", so the EndTable handler was never run.
+    // NOTE(review): assumes EndTable records Embed the same way EndNote does - confirm against
+    // UsfmStructureExtractor.
+    [Test]
+    public void EndTableMarker()
+    {
+        var usfmStructureExtractor = new UsfmStructureExtractor();
+        usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null);
+        usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
+        usfmStructureExtractor.EndTable(_verseTextParserState);
+        usfmStructureExtractor.Text(_verseTextParserState, "test");
+
+        List<Chapter> expectedChapters =
+        [
+            new Chapter(
+                [
+                    new Verse(
+                        [
+                            new TextSegment.Builder()
+                                .SetText("test")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .AddPrecedingMarker(UsfmMarkerType.Embed)
+                                .Build()
+                        ]
+                    )
+                ]
+            )
+        ];
+
+        List<Chapter> actualChapters = usfmStructureExtractor.GetChapters();
+        AssertChapterEqual(expectedChapters, actualChapters);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment);
+    }
+
+    // A reference marker is expected to be recorded as an Embed marker.
+    // This test previously called EndNote with marker "x", so the Ref handler was never run.
+    // NOTE(review): assumes Ref records Embed the same way EndNote does - confirm against
+    // UsfmStructureExtractor.
+    [Test]
+    public void RefMarker()
+    {
+        var usfmStructureExtractor = new UsfmStructureExtractor();
+        usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null);
+        usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
+        usfmStructureExtractor.Ref(_verseTextParserState, "x", null, null);
+        usfmStructureExtractor.Text(_verseTextParserState, "test");
+
+        List<Chapter> expectedChapters =
+        [
+            new Chapter(
+                [
+                    new Verse(
+                        [
+                            new TextSegment.Builder()
+                                .SetText("test")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .AddPrecedingMarker(UsfmMarkerType.Embed)
+                                .Build()
+                        ]
+                    )
+                ]
+            )
+        ];
+
+        List<Chapter> actualChapters = usfmStructureExtractor.GetChapters();
+        AssertChapterEqual(expectedChapters, actualChapters);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment);
+    }
+
+    // The end of a sidebar is expected to be recorded as an Embed marker.
+    // This test previously called EndNote with marker "esb", so the EndSidebar handler was never run.
+    // NOTE(review): assumes EndSidebar records Embed the same way EndNote does - confirm against
+    // UsfmStructureExtractor.
+    [Test]
+    public void SidebarMarker()
+    {
+        var usfmStructureExtractor = new UsfmStructureExtractor();
+        usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null);
+        usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
+        usfmStructureExtractor.EndSidebar(_verseTextParserState, "esb", false);
+        usfmStructureExtractor.Text(_verseTextParserState, "test");
+
+        List<Chapter> expectedChapters =
+        [
+            new Chapter(
+                [
+                    new Verse(
+                        [
+                            new TextSegment.Builder()
+                                .SetText("test")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .AddPrecedingMarker(UsfmMarkerType.Embed)
+                                .Build()
+                        ]
+                    )
+                ]
+            )
+        ];
+
+        List<Chapter> actualChapters = usfmStructureExtractor.GetChapters();
+        AssertChapterEqual(expectedChapters, actualChapters);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment);
+    }
+
+    // Two verses in one chapter produce two Verse entries; segments in different verses are not
+    // linked through PreviousSegment/NextSegment.
+    [Test]
+    public void MultipleVerses()
+    {
+        var usfmStructureExtractor = new UsfmStructureExtractor();
+        usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null);
+        usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
+        usfmStructureExtractor.Text(_verseTextParserState, "test");
+        usfmStructureExtractor.Verse(_verseTextParserState, "2", "v", null, null);
+        usfmStructureExtractor.Text(_verseTextParserState, "test2");
+
+        List<Chapter> expectedChapters =
+        [
+            new Chapter(
+                [
+                    new Verse(
+                        [
+                            new TextSegment.Builder()
+                                .SetText("test")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .Build()
+                        ]
+                    ),
+                    new Verse(
+                        [
+                            new TextSegment.Builder()
+                                .SetText("test2")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .Build()
+                        ]
+                    ),
+                ]
+            )
+        ];
+
+        List<Chapter> actualChapters = usfmStructureExtractor.GetChapters();
+        AssertChapterEqual(expectedChapters, actualChapters);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment);
+        Assert.IsNull(actualChapters[0].Verses[1].TextSegments[0].PreviousSegment);
+        Assert.IsNull(actualChapters[0].Verses[1].TextSegments[0].NextSegment);
+    }
+
+    // A second Chapter callback starts a new Chapter entry; segments are not linked across chapters.
+    [Test]
+    public void MultipleChapters()
+    {
+        var usfmStructureExtractor = new UsfmStructureExtractor();
+        usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null);
+        usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
+        usfmStructureExtractor.Text(_verseTextParserState, "test");
+        usfmStructureExtractor.Chapter(_verseTextParserState, "2", "c", null, null);
+        usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
+        usfmStructureExtractor.Text(_verseTextParserState, "test2");
+
+        List<Chapter> expectedChapters =
+        [
+            new Chapter(
+                [
+                    new Verse(
+                        [
+                            new TextSegment.Builder()
+                                .SetText("test")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .Build()
+                        ]
+                    ),
+                ]
+            ),
+            new Chapter(
+                [
+                    new Verse(
+                        [
+                            new TextSegment.Builder()
+                                .SetText("test2")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .Build()
+                        ]
+                    ),
+                ]
+            ),
+        ];
+
+        List<Chapter> actualChapters = usfmStructureExtractor.GetChapters();
+        AssertChapterEqual(expectedChapters, actualChapters);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment);
+        Assert.IsNull(actualChapters[0].Verses[0].TextSegments[0].NextSegment);
+        Assert.IsNull(actualChapters[1].Verses[0].TextSegments[0].PreviousSegment);
+        Assert.IsNull(actualChapters[1].Verses[0].TextSegments[0].NextSegment);
+    }
+
+    // A character marker in the middle of a verse splits the text into two segments of the same
+    // verse, linked to each other through PreviousSegment/NextSegment.
+    [Test]
+    public void CharacterMarkerInText()
+    {
+        var usfmStructureExtractor = new UsfmStructureExtractor();
+        usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null);
+        usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
+        usfmStructureExtractor.Text(_verseTextParserState, "test");
+        usfmStructureExtractor.StartChar(_verseTextParserState, "k", false, null);
+        usfmStructureExtractor.Text(_verseTextParserState, "test2");
+
+        List<Chapter> expectedChapters =
+        [
+            new Chapter(
+                [
+                    new Verse(
+                        [
+                            new TextSegment.Builder()
+                                .SetText("test")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .Build(),
+                            new TextSegment.Builder()
+                                .SetText("test2")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .AddPrecedingMarker(UsfmMarkerType.Character)
+                                .Build(),
+                        ]
+                    ),
+                ]
+            )
+        ];
+
+        List<Chapter> actualChapters = usfmStructureExtractor.GetChapters();
+        AssertChapterEqual(expectedChapters, actualChapters);
+        Assert.That(
+            actualChapters[0].Verses[0].TextSegments[1].PreviousSegment,
+            Is.EqualTo(expectedChapters[0].Verses[0].TextSegments[0])
+        );
+        Assert.That(
+            actualChapters[0].Verses[0].TextSegments[0].NextSegment,
+            Is.EqualTo(expectedChapters[0].Verses[0].TextSegments[1])
+        );
+    }
+
+    // An empty Text callback produces no segment: only "test" and "test2" are expected, and the
+    // Character marker seen in between still shows up on the "test2" segment.
+    [Test]
+    public void EmptyText()
+    {
+        var usfmStructureExtractor = new UsfmStructureExtractor();
+        usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null);
+        usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
+        usfmStructureExtractor.Text(_verseTextParserState, "test");
+        usfmStructureExtractor.StartChar(_verseTextParserState, "k", false, null);
+        usfmStructureExtractor.Text(_verseTextParserState, "");
+        usfmStructureExtractor.EndChar(_verseTextParserState, "k", null, false);
+        usfmStructureExtractor.Text(_verseTextParserState, "test2");
+
+        List<Chapter> expectedChapters =
+        [
+            new Chapter(
+                [
+                    new Verse(
+                        [
+                            new TextSegment.Builder()
+                                .SetText("test")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .Build(),
+                            new TextSegment.Builder()
+                                .SetText("test2")
+                                .AddPrecedingMarker(UsfmMarkerType.Chapter)
+                                .AddPrecedingMarker(UsfmMarkerType.Verse)
+                                .AddPrecedingMarker(UsfmMarkerType.Character)
+                                .Build(),
+                        ]
+                    ),
+                ]
+            )
+        ];
+
+        List<Chapter> actualChapters = usfmStructureExtractor.GetChapters();
+        AssertChapterEqual(expectedChapters, actualChapters);
+        Assert.That(
+            actualChapters[0].Verses[0].TextSegments[1].PreviousSegment,
+            Is.EqualTo(expectedChapters[0].Verses[0].TextSegments[0])
+        );
+        Assert.That(
+            actualChapters[0].Verses[0].TextSegments[0].NextSegment,
+            Is.EqualTo(expectedChapters[0].Verses[0].TextSegments[1])
+        );
+    }
+
+    // Compares two chapter lists structurally: same number of chapters, verses and segments, and
+    // pairwise-equal segments. The value under test is passed as NUnit's "actual" argument so that
+    // failure messages report expected and actual the right way round.
+    private static void AssertChapterEqual(List<Chapter> expectedChapters, List<Chapter> actualChapters)
+    {
+        Assert.That(actualChapters.Count, Is.EqualTo(expectedChapters.Count));
+        foreach ((Chapter expectedChapter, Chapter actualChapter) in expectedChapters.Zip(actualChapters))
+        {
+            Assert.That(actualChapter.Verses.Count, Is.EqualTo(expectedChapter.Verses.Count));
+            foreach ((Verse expectedVerse, Verse actualVerse) in expectedChapter.Verses.Zip(actualChapter.Verses))
+            {
+                Assert.That(actualVerse.TextSegments.Count, Is.EqualTo(expectedVerse.TextSegments.Count));
+                foreach (
+                    (TextSegment expectedSegment, TextSegment actualSegment) in expectedVerse.TextSegments.Zip(
+                        actualVerse.TextSegments
+                    )
+                )
+                {
+                    Assert.That(actualSegment, Is.EqualTo(expectedSegment));
+                }
+            }
+        }
+    }
+
+    // Parser state whose current verse number can be set directly by a test.
+    private class MockUsfmParserState(UsfmStylesheet stylesheet, ScrVers versification, IReadOnlyList<UsfmToken> tokens)
+        : UsfmParserState(stylesheet, versification, tokens)
+    {
+        public void SetVerseNum(int verseNum)
+        {
+            // Copy, modify, write back - presumably because VerseRef is a value type, so mutating
+            // the property's return value in place would not stick; confirm against SIL.Scripture.
+            VerseRef vref = VerseRef;
+            vref.VerseNum = verseNum;
+            VerseRef = vref;
+        }
+    }
+}
diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/VerseTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/VerseTests.cs
new file mode 100644
index 00000000..4352c814
--- /dev/null
+++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/VerseTests.cs
@@ -0,0 +1,57 @@
+using NUnit.Framework;
+
+namespace SIL.Machine.PunctuationAnalysis;
+
+// Unit tests for Verse: construction from a list of segments and the per-segment bookkeeping
+// (IndexInVerse, NumSegmentsInVerse) that the tests expect the constructor to fill in.
+[TestFixture]
+public class VerseTests
+{
+    // The verse exposes the segments it was constructed with, in order.
+    [Test]
+    public void InitializeVerse()
+    {
+        List<TextSegment> textSegments =
+        [
+            new TextSegment.Builder().SetText("Segment 1").Build(),
+            new TextSegment.Builder().SetText("Segment 2").Build(),
+            new TextSegment.Builder().SetText("Segment 3").Build(),
+        ];
+
+        var verse = new Verse(textSegments);
+
+        Assert.That(verse.TextSegments, Has.Count.EqualTo(3));
+        Assert.That(verse.TextSegments, Is.EqualTo(textSegments));
+    }
+
+    // Each segment's IndexInVerse is its position in the list, even when all texts are identical.
+    [Test]
+    public void SegmentIndices()
+    {
+        List<TextSegment> textSegments =
+        [
+            new TextSegment.Builder().SetText("Segment 1").Build(),
+            new TextSegment.Builder().SetText("Segment 1").Build(),
+            new TextSegment.Builder().SetText("Segment 1").Build(),
+        ];
+
+        var verse = new Verse(textSegments);
+
+        Assert.That(verse.TextSegments[0].IndexInVerse, Is.EqualTo(0));
+        Assert.That(verse.TextSegments[1].IndexInVerse, Is.EqualTo(1));
+        Assert.That(verse.TextSegments[2].IndexInVerse, Is.EqualTo(2));
+    }
+
+    // Every segment's NumSegmentsInVerse is the total number of segments in the verse.
+    [Test]
+    public void NumSegmentsInVerse()
+    {
+        List<TextSegment> textSegments =
+        [
+            new TextSegment.Builder().SetText("Segment 1").Build(),
+            new TextSegment.Builder().SetText("Segment 2").Build(),
+            new TextSegment.Builder().SetText("Segment 3").Build(),
+        ];
+
+        var verse = new Verse(textSegments);
+
+        Assert.That(verse.TextSegments[0].NumSegmentsInVerse, Is.EqualTo(3));
+        Assert.That(verse.TextSegments[1].NumSegmentsInVerse, Is.EqualTo(3));
+        Assert.That(verse.TextSegments[2].NumSegmentsInVerse, Is.EqualTo(3));
+    }
+}