Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,29 +22,12 @@ ParatextProjectSettings settings
_paratextProjectFileHandler = paratextProjectFileHandler;
}

/// <summary>
/// Runs the quote convention analysis without a chapter filter.
/// </summary>
/// <param name="handler">Detector forwarded unchanged to the filtering overload; may be null.</param>
/// <returns>Whatever the overload taking a book-number-keyed chapter filter returns for a null filter.</returns>
public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetector handler = null)
{
    // The typed null selects the overload keyed by book number rather than book id.
    return GetQuoteConventionAnalysis(handler, (Dictionary<int, List<int>>)null);
}

/// <summary>
/// Runs the quote convention analysis with a chapter filter keyed by book id.
/// </summary>
/// <param name="handler">Detector forwarded unchanged to the book-number overload; may be null.</param>
/// <param name="includeChapters">
/// Chapter filter keyed by book id. Each key is translated with <c>Canon.BookIdToNumber</c>;
/// a null filter is forwarded as null.
/// </param>
/// <returns>The result of the overload that takes a book-number-keyed filter.</returns>
public QuoteConventionAnalysis GetQuoteConventionAnalysis(
    QuoteConventionDetector handler = null,
    IReadOnlyDictionary<string, List<int>> includeChapters = null
)
{
    Dictionary<int, List<int>> chaptersByBookNumber = null;
    if (includeChapters != null)
    {
        chaptersByBookNumber = includeChapters.ToDictionary(
            entry => Canon.BookIdToNumber(entry.Key),
            entry => entry.Value
        );
    }
    return GetQuoteConventionAnalysis(handler, chaptersByBookNumber);
}

public QuoteConventionAnalysis GetQuoteConventionAnalysis(
QuoteConventionDetector handler = null,
IReadOnlyDictionary<int, List<int>> includeChapters = null
)
{
handler = handler ?? new QuoteConventionDetector();
var bookQuoteConventionsAnalyses = new List<QuoteConventionAnalysis>();

foreach (
string bookId in Canon
.AllBookNumbers.Where(num => Canon.IsCanonical(num))
Expand All @@ -54,12 +37,14 @@ string bookId in Canon
if (includeChapters != null && !includeChapters.ContainsKey(Canon.BookIdToNumber(bookId)))
continue;

var handler = new QuoteConventionDetector();

string fileName = _settings.GetBookFileName(bookId);
if (!Exists(fileName))
if (!_paratextProjectFileHandler.Exists(fileName))
continue;

string usfm;
using (var reader = new StreamReader(Open(fileName)))
using (var reader = new StreamReader(_paratextProjectFileHandler.Open(fileName)))
{
usfm = reader.ReadToEnd();
}
Expand All @@ -77,12 +62,9 @@ string bookId in Canon
sb.Append($". Error: '{ex.Message}'");
throw new InvalidOperationException(sb.ToString(), ex);
}
bookQuoteConventionsAnalyses.Add(handler.DetectQuoteConvention(includeChapters));
}
return handler.DetectQuoteConvention(includeChapters);
return QuoteConventionAnalysis.CombineWithWeightedAverage(bookQuoteConventionsAnalyses);
}

// Asks the project file handler whether the named file exists.
private bool Exists(string fileName)
{
    return _paratextProjectFileHandler.Exists(fileName);
}

// Opens the named file through the project file handler and returns its stream.
private Stream Open(string fileName)
{
    return _paratextProjectFileHandler.Open(fileName);
}
}
}
85 changes: 72 additions & 13 deletions src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using SIL.Extensions;

Expand All @@ -23,6 +23,16 @@ public void CountQuotationMark(string quotationMark)
TotalCount++;
}

/// <summary>
/// Adds the per-mark counts and the total count of another counter into this one.
/// </summary>
/// <param name="quotationMarkCounts">Counter whose tallies are added to this instance.</param>
public void CountFrom(QuotationMarkCounts quotationMarkCounts)
{
    foreach (KeyValuePair<string, int> entry in quotationMarkCounts._quotationMarkCounter)
    {
        int increment = entry.Value;
        // Marks not yet seen here start from 0 before the increment is applied.
        _quotationMarkCounter.UpdateValue(entry.Key, () => 0, current => current + increment);
    }
    TotalCount += quotationMarkCounts.TotalCount;
}

public (string BestString, int BestStringCount, int TotalStringCount) FindBestQuotationMarkProportion()
{
string bestString = _quotationMarkCounter.MaxBy(kvp => kvp.Value).Key;
Expand Down Expand Up @@ -60,6 +70,29 @@ public void Tabulate(List<QuotationMarkMetadata> quotationMarks)
}
}

/// <summary>
/// Merges the counts of another tabulator into this one, bucket by bucket.
/// </summary>
/// <param name="tabulatedQuotationMarks">Tabulator whose depth/direction buckets are added to this instance.</param>
public void TabulateFrom(QuotationMarkTabulator tabulatedQuotationMarks)
{
    foreach (var entry in tabulatedQuotationMarks._quotationCountsByDepthAndDirection)
    {
        (int Depth, QuotationMarkDirection Direction) bucketKey = entry.Key;
        QuotationMarkCounts incomingCounts = entry.Value;

        // Create the bucket on first sight so CountFrom always has a target.
        if (!_quotationCountsByDepthAndDirection.TryGetValue(bucketKey, out QuotationMarkCounts ownCounts))
        {
            ownCounts = new QuotationMarkCounts();
            _quotationCountsByDepthAndDirection[bucketKey] = ownCounts;
        }
        ownCounts.CountFrom(incomingCounts);
    }
}

private void CountQuotationMark(QuotationMarkMetadata quote)
{
(int Depth, QuotationMarkDirection Direction) key = (quote.Depth, quote.Direction);
Expand All @@ -75,26 +108,52 @@ private void CountQuotationMark(QuotationMarkMetadata quote)
);
}

/// <summary>
/// Returns the sum of <c>TotalCount</c> over every depth/direction bucket.
/// </summary>
public int GetTotalQuotationMarkCount() =>
    _quotationCountsByDepthAndDirection.Values.Sum(counts => counts.TotalCount);

/// <summary>
/// Scores how closely the tabulated quotation marks agree with the marks that
/// <paramref name="quoteConvention"/> expects at each observed depth and direction.
/// </summary>
/// <param name="quoteConvention">Convention queried for the expected mark of each (depth, direction) pair.</param>
/// <returns>A similarity score; the final statements return 0 when no marks were counted.</returns>
// NOTE(review): this span appears to contain statements from two revisions of the method
// interleaved (a weighted-difference computation and a per-depth score computation) — it looks
// like a diff view without +/- markers. Confirm which statements belong to the current revision.
public double CalculateSimilarity(QuoteConvention quoteConvention)
{
double weightedDifference = 0.0;
double totalWeight = 0.0;
foreach ((int depth, QuotationMarkDirection direction) in _quotationCountsByDepthAndDirection.Keys)
// Per-depth tallies: how many marks were seen, and how many of them equal the expected mark.
var numMarksByDepth = new Dictionary<int, int>();
var numMatchingMarksByDepth = new Dictionary<int, int>();
foreach (
(int depth, QuotationMarkDirection direction) in _quotationCountsByDepthAndDirection.Keys.OrderBy(k =>
k
)
)
{
string expectedQuotationMark = quoteConvention.GetExpectedQuotationMark(depth, direction);

// Give higher weight to shallower depths, since deeper marks are more likely to be mistakes
weightedDifference += (
_quotationCountsByDepthAndDirection[(depth, direction)]
.CalculateNumDifferences(expectedQuotationMark) * Math.Pow(2, -depth)
// Despite its name, numMatchingMarks holds the bucket's TotalCount; the number of
// differing marks is subtracted from it below to obtain the matching count.
int numMatchingMarks = _quotationCountsByDepthAndDirection[(depth, direction)].TotalCount;
numMarksByDepth.UpdateValue(depth, () => 0, i => i + numMatchingMarks);
numMatchingMarksByDepth.UpdateValue(
depth,
() => 0,
i =>
i
+ numMatchingMarks
- _quotationCountsByDepthAndDirection[(depth, direction)]
.CalculateNumDifferences(expectedQuotationMark)
);
totalWeight += _quotationCountsByDepthAndDirection[(depth, direction)].TotalCount * Math.Pow(2, -depth);
}
if (totalWeight == 0.0)

// The scores of greater depths depend on the scores of shallower depths
var scoresByDepth = new Dictionary<int, double>();
foreach (int depth in numMarksByDepth.Keys.OrderBy(k => k))
{
return 0.0;
// A depth whose immediate shallower depth has no score keeps a factor of 1; otherwise the
// factor is that shallower depth's score divided by its number of marks.
double previousDepthScore = 1;
if (scoresByDepth.TryGetValue(depth - 1, out double score))
{
previousDepthScore = score / numMarksByDepth[depth - 1];
}
scoresByDepth[depth] = previousDepthScore * numMatchingMarksByDepth[depth];
}
return 1 - (weightedDifference / totalWeight);
int totalMarks = numMarksByDepth.Values.Sum();
double totalScore = scoresByDepth.Values.Sum();

// Avoid dividing by zero when nothing was tabulated.
if (totalMarks == 0)
return 0;
return totalScore / totalMarks;
}

private bool DepthAndDirectionObserved(int depth, QuotationMarkDirection direction)
Expand Down
18 changes: 18 additions & 0 deletions src/SIL.Machine/PunctuationAnalysis/QuoteConvention.cs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ out char quote
: ClosingQuotationMark;
return new SingleLevelQuoteConvention(normalizedOpeningQuotationMark, normalizedClosingQuotationMark);
}

/// <summary>
/// Computes a hash code from the opening and closing quotation marks.
/// </summary>
/// <returns>A hash combining <c>OpeningQuotationMark</c> and <c>ClosingQuotationMark</c>.</returns>
public override int GetHashCode()
{
    // The multiply-and-add mixing is expected to wrap around. Without an explicit
    // unchecked context it would throw OverflowException in builds that enable
    // arithmetic overflow checking.
    unchecked
    {
        int hashCode = 23;
        hashCode = hashCode * 31 + OpeningQuotationMark.GetHashCode();
        hashCode = hashCode * 31 + ClosingQuotationMark.GetHashCode();
        return hashCode;
    }
}
}

public class QuoteConvention
Expand Down Expand Up @@ -150,5 +158,15 @@ public QuoteConvention Normalize()
{
return new QuoteConvention(Name + "_normalized", LevelConventions.Select(l => l.Normalize()).ToList());
}

/// <summary>
/// Computes a hash code from the hash codes of the level conventions, in order.
/// </summary>
/// <returns>A hash combining every entry of <c>LevelConventions</c>.</returns>
public override int GetHashCode()
{
    // The multiply-and-add mixing is expected to wrap around. Without an explicit
    // unchecked context it would throw OverflowException in builds that enable
    // arithmetic overflow checking.
    unchecked
    {
        int hashCode = 23;
        foreach (SingleLevelQuoteConvention quoteConvention in LevelConventions)
        {
            hashCode = hashCode * 31 + quoteConvention.GetHashCode();
        }
        return hashCode;
    }
}
}
}
100 changes: 100 additions & 0 deletions src/SIL.Machine/PunctuationAnalysis/QuoteConventionAnalysis.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
using System.Collections.Generic;
using System.Linq;
using SIL.Extensions;

namespace SIL.Machine.PunctuationAnalysis
{
/// <summary>
/// Holds the score of every candidate quote convention for a set of tabulated quotation marks,
/// together with the best-scoring convention and the weight this analysis carries when it is
/// combined with others.
/// </summary>
public class QuoteConventionAnalysis
{
    /// <summary>Highest-scoring convention, or null when no scores were supplied.</summary>
    public QuoteConvention BestQuoteConvention { get; private set; }

    /// <summary>Score of <see cref="BestQuoteConvention"/>, or 0 when no scores were supplied.</summary>
    public double BestQuoteConventionScore { get; private set; }

    /// <summary>
    /// Summary message produced by the tabulated quotation marks, or null when there is no tabulator.
    /// </summary>
    // Previously this property was never assigned and therefore always null. It is computed on
    // demand so that constructing an analysis never depends on building the message.
    public string AnalysisSummary => TabulatedQuotationMarks?.GetSummaryMessage();

    /// <summary>Score recorded for each convention.</summary>
    public IReadOnlyDictionary<QuoteConvention, double> ConventionScores { get; private set; }

    /// <summary>Tabulated quotation marks the scores were computed from.</summary>
    public QuotationMarkTabulator TabulatedQuotationMarks { get; private set; }

    /// <summary>Weight applied to this analysis by <see cref="CombineWithWeightedAverage"/>.</summary>
    public double AnalysisWeight { get; private set; }

    /// <summary>
    /// Creates an analysis from per-convention scores and picks the best-scoring convention.
    /// </summary>
    /// <param name="conventionScores">Score for each convention; must not be null, may be empty.</param>
    /// <param name="tabulatedQuotationMarks">Tabulated marks the scores were computed from.</param>
    /// <param name="analysisWeight">Weight of this analysis when combined with others.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="conventionScores"/> is null.</exception>
    public QuoteConventionAnalysis(
        Dictionary<QuoteConvention, double> conventionScores,
        QuotationMarkTabulator tabulatedQuotationMarks,
        double analysisWeight = 1.0
    )
    {
        if (conventionScores == null)
        {
            throw new System.ArgumentNullException(nameof(conventionScores));
        }

        ConventionScores = conventionScores;
        if (ConventionScores.Count > 0)
        {
            KeyValuePair<QuoteConvention, double> maxKvp = ConventionScores.MaxBy(kvp => kvp.Value);
            (BestQuoteConvention, BestQuoteConventionScore) = (maxKvp.Key, maxKvp.Value);
        }
        else
        {
            BestQuoteConventionScore = 0;
            BestQuoteConvention = null;
        }
        TabulatedQuotationMarks = tabulatedQuotationMarks;
        AnalysisWeight = analysisWeight;
    }

    /// <summary>
    /// Accumulates per-convention scores for one tabulator and builds the resulting analysis.
    /// </summary>
    public class Builder
    {
        /// <summary>Scores recorded so far.</summary>
        public Dictionary<QuoteConvention, double> ConventionScores { get; private set; }

        /// <summary>Tabulated quotation marks the recorded scores refer to.</summary>
        public QuotationMarkTabulator TabulatedQuotationMarks { get; private set; }

        /// <summary>Creates a builder with no recorded scores.</summary>
        /// <param name="tabulatedQuotationMarks">Tabulator passed on to the built analysis.</param>
        public Builder(QuotationMarkTabulator tabulatedQuotationMarks)
        {
            ConventionScores = new Dictionary<QuoteConvention, double>();
            TabulatedQuotationMarks = tabulatedQuotationMarks;
        }

        /// <summary>Records the score for a convention, replacing any earlier score for it.</summary>
        public void RecordConventionScore(QuoteConvention quoteConvention, double score)
        {
            ConventionScores[quoteConvention] = score;
        }

        /// <summary>
        /// Builds the analysis; its weight is the total number of tabulated quotation marks.
        /// </summary>
        public QuoteConventionAnalysis Build()
        {
            return new QuoteConventionAnalysis(
                ConventionScores,
                TabulatedQuotationMarks,
                TabulatedQuotationMarks.GetTotalQuotationMarkCount()
            );
        }
    }

    /// <summary>
    /// Combines several analyses into one: the tabulated marks are merged and each convention's
    /// score becomes the average of its scores weighted by each analysis's weight.
    /// </summary>
    /// <param name="quoteConventionAnalyses">Analyses to combine; must not be null, may be empty.</param>
    /// <returns>
    /// An analysis over the merged tabulation. Conventions whose weighted score total is not
    /// positive are left out.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="quoteConventionAnalyses"/> is null.</exception>
    public static QuoteConventionAnalysis CombineWithWeightedAverage(
        List<QuoteConventionAnalysis> quoteConventionAnalyses
    )
    {
        if (quoteConventionAnalyses == null)
        {
            throw new System.ArgumentNullException(nameof(quoteConventionAnalyses));
        }

        double totalWeight = 0;
        Dictionary<string, double> conventionVotes = new Dictionary<string, double>();
        Dictionary<string, QuoteConvention> quoteConventionsByName = new Dictionary<string, QuoteConvention>();
        QuotationMarkTabulator totalTabulatedQuotationMarks = new QuotationMarkTabulator();
        foreach (QuoteConventionAnalysis quoteConventionAnalysis in quoteConventionAnalyses)
        {
            totalTabulatedQuotationMarks.TabulateFrom(quoteConventionAnalysis.TabulatedQuotationMarks);
            totalWeight += quoteConventionAnalysis.AnalysisWeight;
            foreach (
                (QuoteConvention convention, double score) in quoteConventionAnalysis.ConventionScores.Select(kvp =>
                    (kvp.Key, kvp.Value)
                )
            )
            {
                // Votes are keyed by convention name, so same-named conventions from
                // different analyses accumulate into a single total.
                quoteConventionsByName[convention.Name] = convention;
                conventionVotes.UpdateValue(
                    convention.Name,
                    () => 0,
                    s => s + score * quoteConventionAnalysis.AnalysisWeight
                );
            }
        }
        QuoteConventionAnalysis.Builder builder = new QuoteConventionAnalysis.Builder(totalTabulatedQuotationMarks);
        foreach ((string conventionName, double totalScore) in conventionVotes.Select(kvp => (kvp.Key, kvp.Value)))
        {
            // Only positive totals are recorded; this is also what keeps the division
            // from being reached with nothing accumulated.
            if (totalScore > 0)
            {
                builder.RecordConventionScore(quoteConventionsByName[conventionName], totalScore / totalWeight);
            }
        }
        return builder.Build();
    }
}
}
32 changes: 1 addition & 31 deletions src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,6 @@

namespace SIL.Machine.PunctuationAnalysis
{
/// <summary>
/// Carries the outcome of a quote convention detection: the chosen convention, its score and a
/// textual summary. All three values are set once by the constructor.
/// </summary>
public class QuoteConventionAnalysis
{
    /// <summary>Convention passed to the constructor as the best match.</summary>
    public QuoteConvention BestQuoteConvention { get; private set; }

    /// <summary>Score passed to the constructor for the best match.</summary>
    public double BestQuoteConventionScore { get; private set; }

    /// <summary>Summary text passed to the constructor.</summary>
    public string AnalysisSummary { get; private set; }

    /// <summary>Stores the three supplied values unchanged.</summary>
    public QuoteConventionAnalysis(
        QuoteConvention bestQuoteConvention,
        double bestQuoteConventionScore,
        string analysisSummary
    )
    {
        (BestQuoteConvention, BestQuoteConventionScore, AnalysisSummary) = (
            bestQuoteConvention,
            bestQuoteConventionScore,
            analysisSummary
        );
    }
}

public class QuoteConventionDetector : UsfmStructureExtractor
{
private readonly QuotationMarkTabulator _quotationMarkTabulator;
Expand Down Expand Up @@ -60,19 +42,7 @@ public QuoteConventionAnalysis DetectQuoteConvention(IReadOnlyDictionary<int, Li
{
CountQuotationMarksInChapters(GetChapters(includeChapters));

(QuoteConvention bestQuoteConvention, double score) = QuoteConventions.Standard.FindMostSimilarConvention(
_quotationMarkTabulator
);

if (score > 0 && bestQuoteConvention != null)
{
return new QuoteConventionAnalysis(
bestQuoteConvention,
score,
_quotationMarkTabulator.GetSummaryMessage()
);
}
return null;
return QuoteConventions.Standard.ScoreAllQuoteConventions(_quotationMarkTabulator);
}
}
}
11 changes: 11 additions & 0 deletions src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs
Original file line number Diff line number Diff line change
Expand Up @@ -230,5 +230,16 @@ QuotationMarkTabulator tabulatedQuotationMarks
}
return (bestQuoteConvention, bestSimilarity);
}

/// <summary>
/// Scores every convention in this set against the tabulated quotation marks.
/// </summary>
/// <param name="tabulatedQuotationMarks">Tabulator used to compute each convention's similarity.</param>
/// <returns>An analysis built from one recorded score per convention in <c>Conventions</c>.</returns>
public QuoteConventionAnalysis ScoreAllQuoteConventions(QuotationMarkTabulator tabulatedQuotationMarks)
{
    QuoteConventionAnalysis.Builder analysisBuilder = new QuoteConventionAnalysis.Builder(
        tabulatedQuotationMarks
    );
    foreach (QuoteConvention candidate in Conventions)
    {
        analysisBuilder.RecordConventionScore(candidate, tabulatedQuotationMarks.CalculateSimilarity(candidate));
    }
    return analysisBuilder.Build();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,15 @@ public static class QuoteConventions
new SingleLevelQuoteConvention("\u2019", "\u2018"),
}
),
new QuoteConvention(
"arabic_inspired_western_european",
new List<SingleLevelQuoteConvention>
{
new SingleLevelQuoteConvention("\u00ab", "\u00bb"),
new SingleLevelQuoteConvention("\u201d", "\u201c"),
new SingleLevelQuoteConvention("\u2019", "\u2018"),
}
),
}
);
}
Expand Down
Loading
Loading