From f88004a60df8ae3b88c9555638915aa6e1f7351e Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 29 Sep 2025 13:30:01 -0400 Subject: [PATCH 1/2] Copy silnlp gloss-cleaning functions exactly; separate rendering and gloss functions. --- .../Corpora/ParatextProjectTermsParserBase.cs | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs index b58ce3c6..4d703dea 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs @@ -135,9 +135,9 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti .Select(kvp => { string id = kvp.Item1.Replace("\n", " "); - string gloss = kvp.Item2.Element("Renderings").Value; - IReadOnlyList glosses = GetGlosses(gloss); - return (id, glosses); + string rendering = kvp.Item2.Element("Renderings").Value; + IReadOnlyList renderings = GetRenderings(rendering); + return (id, renderings); }) .GroupBy(kvp => kvp.Item1, kvp => kvp.Item2) //Handle duplicate term ids (which do exist) e.g. שִׁלֵּמִי .Select(grouping => (grouping.Key, grouping.SelectMany(g => g))) @@ -202,27 +202,39 @@ IDictionary> termIdToReferences ); } + private static string CleanTerm(string term) + { + term = term.Trim(); + term = StripParens(term); + term = string.Join(" ", term.Split()); + return term; + } + public static IReadOnlyList GetGlosses(string gloss) { //If entire term rendering is surrounded in square brackets, remove them Match match = ContentInBracketsRegex.Match(gloss); if (match.Success) - gloss = match.Groups[0].Value; + gloss = match.Groups[1].Value; gloss = gloss.Replace("?", ""); - gloss = gloss.Replace("*", ""); - gloss = gloss.Replace("/", " "); - gloss = gloss.Trim(); - gloss = StripParens(gloss); + gloss = CleanTerm(gloss); gloss = StripParens(gloss, left: '[', right: ']'); gloss = gloss.Trim(); foreach (Match m in NumericalInformationRegex.Matches(gloss)) { gloss.Replace(m.Value, ""); } - IEnumerable glosses = Regex.Split(gloss, @"\|\|"); - glosses = glosses.SelectMany(g => g.Split(new char[] { ',', ';' })); - glosses = glosses.Select(g => g.Trim()).Where(s => s != "").Distinct().ToList(); - return (IReadOnlyList)glosses; + return Regex.Split(gloss, @"[;,/]").Select(g => g.Trim()).Where(s => s != "").Distinct().ToList(); + } + + public static IReadOnlyList GetRenderings(string rendering) + { + return Regex + .Split(rendering.Trim(), @"\|\|") + .Select(r => CleanTerm(r)) + .Select(r => r.Replace("*", "")) + .Where(r => r != "") + .ToList(); } /// From b5a6bdfbbf40fbead17558f10360a29a1ee390aa Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 29 Sep 2025 16:38:52 -0400 Subject: [PATCH 2/2] Separate terms and glosses tests --- .../Corpora/ParatextProjectTermsParserBase.cs | 2 +- .../Corpora/ParatextProjectTermsParserTests.cs | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs index 4d703dea..38e3904e 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs @@ -231,7 +231,7 @@ public static IReadOnlyList GetRenderings(string rendering) { return Regex .Split(rendering.Trim(), @"\|\|") - .Select(r => CleanTerm(r)) + .Select(r => CleanTerm(r).Trim()) .Select(r => r.Replace("*", "")) .Where(r => r != "") .ToList(); diff --git a/tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsParserTests.cs b/tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsParserTests.cs index 54034b9e..1e3fb736 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsParserTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsParserTests.cs @@ -159,10 +159,6 @@ public void TestStripParens(string testString, string expectedOutput, char left [Test] [TestCase("", new string[] { })] - [TestCase("*Abba* /", new string[] { "Abba" })] - [TestCase("Abba|| ", new string[] { "Abba" })] - [TestCase("Abba||Abbah?", new string[] { "Abba", "Abbah" })] - [TestCase("Abba (note)", new string[] { "Abba" })] [TestCase("Abba (note)", new string[] { "Abba" })] [TestCase("Ahasuerus, Xerxes; Assuerus", new string[] { "Ahasuerus", "Xerxes", "Assuerus" })] public void TestGetGlosses(string glossString, IReadOnlyList expectedOutput) @@ -170,6 +166,17 @@ public void TestGetGlosses(string glossString, IReadOnlyList expectedOut Assert.That(ParatextProjectTermsParserBase.GetGlosses(glossString), Is.EqualTo(expectedOutput)); } + [Test] + [TestCase("", new string[] { })] + [TestCase("*Abba*", new string[] { "Abba" })] + [TestCase("Abba|| ", new string[] { "Abba" })] + [TestCase("Abba||Abbah", new string[] { "Abba", "Abbah" })] + [TestCase("Abba (note)", new string[] { "Abba" })] + public void TestGetRenderings(string renderingString, IReadOnlyList expectedOutput) + { + Assert.That(ParatextProjectTermsParserBase.GetRenderings(renderingString), Is.EqualTo(expectedOutput)); + } + private class TestEnvironment( ParatextProjectSettings? settings = null, Dictionary? files = null,