From ff8c3dbf738a14f1b15db8d11d915787fbdc346d Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 5 Aug 2025 17:02:37 -0400 Subject: [PATCH 1/5] Pretranslation quotation denormalization --- src/Serval/src/Serval.Client/Client.g.cs | 157 +++++++++--------- .../Contracts/CorpusAnalysisDto.cs | 8 + .../PretranslationQuotationMarkBehavior.cs | 7 + .../TranslationEnginesController.cs | 12 +- .../src/Serval.Translation/Models/Build.cs | 2 +- .../Models/CorpusAnalysis.cs | 8 + .../Services/IPretranslationService.cs | 1 + .../Services/PretranslationService.cs | 120 ++++++++++++- .../Services/PretranslationServiceTests.cs | 120 ++++++++++--- .../test/Serval.Translation.Tests/Usings.cs | 1 + 10 files changed, 313 insertions(+), 123 deletions(-) create mode 100644 src/Serval/src/Serval.Translation/Contracts/CorpusAnalysisDto.cs create mode 100644 src/Serval/src/Serval.Translation/Contracts/PretranslationQuotationMarkBehavior.cs create mode 100644 src/Serval/src/Serval.Translation/Models/CorpusAnalysis.cs diff --git a/src/Serval/src/Serval.Client/Client.g.cs b/src/Serval/src/Serval.Client/Client.g.cs index da2c01b4..f86b52b5 100644 --- a/src/Serval/src/Serval.Client/Client.g.cs +++ b/src/Serval/src/Serval.Client/Client.g.cs @@ -438,7 +438,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -450,7 +450,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? 
string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -1162,7 +1162,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -1174,7 +1174,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -1220,7 +1220,7 @@ public partial interface IDataFilesClient /// /// /// Sample request: - ///
+ ///
///
POST /files ///
{ ///
"format": "text", @@ -1435,7 +1435,7 @@ public string BaseUrl /// /// /// Sample request: - ///
+ ///
///
POST /files ///
{ ///
"format": "text", @@ -2088,7 +2088,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -2100,7 +2100,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -2159,14 +2159,14 @@ public partial interface ITranslationEnginesClient ///
### nmt ///
The Neural Machine Translation engine is primarily used for pretranslations. It is fine-tuned from Meta's NLLB-200. Valid IETF language tags provided to Serval will be converted to [NLLB-200 codes](https://github.com/facebookresearch/flores/tree/main/flores200#languages-in-flores-200). See more about language tag resolution [here](https://github.com/sillsdev/serval/wiki/FLORES%E2%80%90200-Language-Code-Resolution-for-NMT-Engine). ///
* **IsModelPersisted**: (default to false) Whether the model can be downloaded by the client after it has been successfully built. - ///
+ ///
///
If you use a language among NLLB's supported languages, Serval will utilize everything the NLLB-200 model already knows about that language when translating. If the language you are working with is not among NLLB's supported languages, the language code will have no effect. - ///
+ ///
///
Typical endpoints: pretranslate ///
### echo ///
The echo engine has full coverage of all nmt and smt-transfer endpoints. Endpoints like create and build return empty responses. Endpoints like translate and get-word-graph echo the sent content back to the user in a format that mocks nmt or Smt. For example, translating a segment "test" with the echo engine would yield a translation response with translation "test". This engine is useful for debugging and testing purposes. ///
## Sample request: - ///
+ ///
///
{ ///
"name": "myTeam:myProject:myEngine", ///
"sourceLanguage": "el", @@ -2204,7 +2204,7 @@ public partial interface ITranslationEnginesClient /// /// /// ## Sample request: - ///
+ ///
///
{ ///
"sourceLanguage": "en", ///
"targetLanguage": "en" @@ -2413,7 +2413,7 @@ public partial interface ITranslationEnginesClient ///
* The references defined in the SourceFile per line, if any. ///
* An auto-generated reference of `[TextId]:[lineNumber]`, 1 indexed. ///
* **Translation**: the text of the pretranslation - ///
+ ///
///
Pretranslations can be filtered by text id if provided. ///
Only pretranslations for the most recent successful build of the engine are returned. ///
@@ -2435,7 +2435,7 @@ public partial interface ITranslationEnginesClient ///
* The references defined in the SourceFile per line, if any. ///
* An auto-generated reference of `[TextId]:[lineNumber]`, 1 indexed. ///
* **Translation**: the text of the pretranslation - ///
+ ///
///
Only pretranslations for the most recent successful build of the engine are returned. ///
/// The translation engine id @@ -2455,22 +2455,22 @@ public partial interface ITranslationEnginesClient ///
* `PreferPretranslated`: The existing and pretranslated texts are merged into the USFM, preferring pretranslated text. ///
* `OnlyExisting`: Return the existing target USFM file with no modifications (except updating the USFM id if needed). ///
* `OnlyPretranslated`: Only the pretranslated text is returned; all existing text in the target USFM is removed. - ///
+ ///
///
The source or target book can be used as the USFM template for the pretranslated text. The template can be controlled by the `template` parameter: ///
* `Auto`: The target book is used as the template if it exists; otherwise, the source book is used. **This is the default**. ///
* `Source`: The source book is used as the template. ///
* `Target`: The target book is used as the template. - ///
+ ///
///
The intra-verse USFM markers are handled in the following way: ///
* All verse and non-verse text is stripped of all intra-verse USFM to be pretranslated (if the book is chosen). ///
* Reference (\r) and remark (\rem) markers are not translated but carried through from the source to the target. ///
* Notes are stripped. - ///
+ ///
///
Preserving or stripping different types of USFM markers can be controlled by the `paragraphMarkerBehavior`, `embedBehavior`, and `styleMarkerBehavior` parameters. ///
* `PushToEnd`: The USFM markers (or the entire embed) are preserved and placed at the end of the verse. **This is the default for paragraph markers and embeds**. ///
* `TryToPlace`: The USFM markers (or the entire embed) are placed in approximately the right location within the verse. **This option is only available for paragraph markers. Quality of placement may differ from language to language.**. ///
* `Strip`: The USFM markers (or the entire embed) are removed. **This is the default for style markers**. - ///
+ ///
///
Only pretranslations for the most recent successful build of the engine are returned. ///
The USFM parsing and marker types used are defined here: [this wiki](https://github.com/sillsdev/serval/wiki/USFM-Parsing-and-Translation). ///
@@ -2484,7 +2484,7 @@ public partial interface ITranslationEnginesClient /// The behavior of style markers. /// The book in USFM format /// A server side error occurred. - System.Threading.Tasks.Task GetPretranslatedUsfmAsync(string id, string corpusId, string textId, PretranslationUsfmTextOrigin? textOrigin = null, PretranslationUsfmTemplate? template = null, PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior = null, PretranslationUsfmMarkerBehavior? embedBehavior = null, PretranslationUsfmMarkerBehavior? styleMarkerBehavior = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)); + System.Threading.Tasks.Task GetPretranslatedUsfmAsync(string id, string corpusId, string textId, PretranslationUsfmTextOrigin? textOrigin = null, PretranslationUsfmTemplate? template = null, PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior = null, PretranslationUsfmMarkerBehavior? embedBehavior = null, PretranslationUsfmMarkerBehavior? styleMarkerBehavior = null, PretranslationQuotationMarkBehavior? quotationMarkBehavior = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)); /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// @@ -2504,30 +2504,30 @@ public partial interface ITranslationEnginesClient ///
Specifying a corpus: ///
* A (legacy) corpus is selected by specifying CorpusId and a parallel corpus is selected by specifying ParallelCorpusId. ///
* A parallel corpus can be further filtered by specifying particular CorpusIds in SourceFilters or TargetFilters. - ///
+ ///
///
Filtering by textID or chapter: ///
* Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. ///
* Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) ///
* All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. - ///
+ ///
///
Filter - train on all or none ///
* If trainOn or pretranslate is not provided, all corpora will be used for training or pretranslation respectively ///
* If a corpus is selected for training or pretranslation and neither scriptureRange nor textIds are defined, all of the selected corpus will be used. ///
* If a corpus is selected for training or pretranslation and an empty scriptureRange or textIds is defined, none of the selected corpus will be used. ///
* If a corpus is selected for training or pretranslation but no further filters are provided, all selected corpora will be used for training or pretranslation respectively. - ///
+ ///
///
Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation, ///
the following text will be pretranslated: ///
* Text segments that are in the source and not the target (untranslated) ///
* Text segments that are in the source and the target, but where that target segment is not trained on. ///
If the engine does not support pretranslation, these fields have no effect. ///
Pretranslating has the same filtering as training. - ///
+ ///
///
The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. ///
See [nmt job settings documentation](https://github.com/sillsdev/serval/wiki/NMT-Build-Options) about configuring job parameters. ///
See [smt-transfer job settings documentation](https://github.com/sillsdev/serval/wiki/SMT-Transfer-Build-Options) about configuring job parameters. ///
See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. - ///
+ ///
///
When using a parallel corpus: ///
* If, within a single parallel corpus, multiple source corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only using the scriptureRange), those sources will be mixed where they overlap by randomly choosing from each source per line/verse. ///
* If, within a single parallel corpus, multiple target corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only using the scriptureRange), only the first of the targets that includes that textId/book will be used for that textId/book. @@ -2588,10 +2588,10 @@ public partial interface ITranslationEnginesClient /// /// If a Nmt build was successful and IsModelPersisted is `true` for the engine, ///
then the model from the most recent successful build can be downloaded. - ///
+ ///
///
The endpoint will return a URL that can be used to download the model for up to 1 hour ///
after the request is made. If the URL is not used within that time, a new request will need to be made. - ///
+ ///
///
The download itself is created by g-zipping together the folder containing the fine tuned model ///
with all necessary supporting files. This zipped folder is then named by the pattern: ///
* <engine_id>_<model_revision>.tar.gz @@ -2762,14 +2762,14 @@ public string BaseUrl ///
### nmt ///
The Neural Machine Translation engine is primarily used for pretranslations. It is fine-tuned from Meta's NLLB-200. Valid IETF language tags provided to Serval will be converted to [NLLB-200 codes](https://github.com/facebookresearch/flores/tree/main/flores200#languages-in-flores-200). See more about language tag resolution [here](https://github.com/sillsdev/serval/wiki/FLORES%E2%80%90200-Language-Code-Resolution-for-NMT-Engine). ///
* **IsModelPersisted**: (default to false) Whether the model can be downloaded by the client after it has been successfully built. - ///
+ ///
///
If you use a language among NLLB's supported languages, Serval will utilize everything the NLLB-200 model already knows about that language when translating. If the language you are working with is not among NLLB's supported languages, the language code will have no effect. - ///
+ ///
///
Typical endpoints: pretranslate ///
### echo ///
The echo engine has full coverage of all nmt and smt-transfer endpoints. Endpoints like create and build return empty responses. Endpoints like translate and get-word-graph echo the sent content back to the user in a format that mocks nmt or Smt. For example, translating a segment "test" with the echo engine would yield a translation response with translation "test". This engine is useful for debugging and testing purposes. ///
## Sample request: - ///
+ ///
///
{ ///
"name": "myTeam:myProject:myEngine", ///
"sourceLanguage": "el", @@ -3086,7 +3086,7 @@ public string BaseUrl ///
/// /// ## Sample request: - ///
+ ///
///
{ ///
"sourceLanguage": "en", ///
"targetLanguage": "en" @@ -4888,7 +4888,7 @@ public string BaseUrl ///
* The references defined in the SourceFile per line, if any. ///
* An auto-generated reference of `[TextId]:[lineNumber]`, 1 indexed. ///
* **Translation**: the text of the pretranslation - ///
+ ///
///
Pretranslations can be filtered by text id if provided. ///
Only pretranslations for the most recent successful build of the engine are returned. ///
@@ -5022,7 +5022,7 @@ public string BaseUrl ///
* The references defined in the SourceFile per line, if any. ///
* An auto-generated reference of `[TextId]:[lineNumber]`, 1 indexed. ///
* **Translation**: the text of the pretranslation - ///
+ ///
///
Only pretranslations for the most recent successful build of the engine are returned. /// /// The translation engine id @@ -5152,22 +5152,22 @@ public string BaseUrl ///
* `PreferPretranslated`: The existing and pretranslated texts are merged into the USFM, preferring pretranslated text. ///
* `OnlyExisting`: Return the existing target USFM file with no modifications (except updating the USFM id if needed). ///
* `OnlyPretranslated`: Only the pretranslated text is returned; all existing text in the target USFM is removed. - ///
+ ///
///
The source or target book can be used as the USFM template for the pretranslated text. The template can be controlled by the `template` parameter: ///
* `Auto`: The target book is used as the template if it exists; otherwise, the source book is used. **This is the default**. ///
* `Source`: The source book is used as the template. ///
* `Target`: The target book is used as the template. - ///
+ ///
///
The intra-verse USFM markers are handled in the following way: ///
* All verse and non-verse text is stripped of all intra-verse USFM to be pretranslated (if the book is chosen). ///
* Reference (\r) and remark (\rem) markers are not translated but carried through from the source to the target. ///
* Notes are stripped. - ///
+ ///
///
Preserving or stripping different types of USFM markers can be controlled by the `paragraphMarkerBehavior`, `embedBehavior`, and `styleMarkerBehavior` parameters. ///
* `PushToEnd`: The USFM markers (or the entire embed) are preserved and placed at the end of the verse. **This is the default for paragraph markers and embeds**. ///
* `TryToPlace`: The USFM markers (or the entire embed) are placed in approximately the right location within the verse. **This option is only available for paragraph markers. Quality of placement may differ from language to language.**. ///
* `Strip`: The USFM markers (or the entire embed) are removed. **This is the default for style markers**. - ///
+ ///
///
Only pretranslations for the most recent successful build of the engine are returned. ///
The USFM parsing and marker types used are defined here: [this wiki](https://github.com/sillsdev/serval/wiki/USFM-Parsing-and-Translation). /// @@ -5181,7 +5181,7 @@ public string BaseUrl /// The behavior of style markers. /// The book in USFM format /// A server side error occurred. - public virtual async System.Threading.Tasks.Task GetPretranslatedUsfmAsync(string id, string corpusId, string textId, PretranslationUsfmTextOrigin? textOrigin = null, PretranslationUsfmTemplate? template = null, PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior = null, PretranslationUsfmMarkerBehavior? embedBehavior = null, PretranslationUsfmMarkerBehavior? styleMarkerBehavior = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) + public virtual async System.Threading.Tasks.Task GetPretranslatedUsfmAsync(string id, string corpusId, string textId, PretranslationUsfmTextOrigin? textOrigin = null, PretranslationUsfmTemplate? template = null, PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior = null, PretranslationUsfmMarkerBehavior? embedBehavior = null, PretranslationUsfmMarkerBehavior? styleMarkerBehavior = null, PretranslationQuotationMarkBehavior? 
quotationMarkBehavior = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) { if (id == null) throw new System.ArgumentNullException("id"); @@ -5232,6 +5232,10 @@ public string BaseUrl { urlBuilder_.Append(System.Uri.EscapeDataString("style-marker-behavior")).Append('=').Append(System.Uri.EscapeDataString(ConvertToString(styleMarkerBehavior, System.Globalization.CultureInfo.InvariantCulture))).Append('&'); } + if (quotationMarkBehavior != null) + { + urlBuilder_.Append(System.Uri.EscapeDataString("quotation-mark-behavior")).Append('=').Append(System.Uri.EscapeDataString(ConvertToString(quotationMarkBehavior, System.Globalization.CultureInfo.InvariantCulture))).Append('&'); + } urlBuilder_.Length--; PrepareRequest(client_, request_, urlBuilder_); @@ -5438,30 +5442,30 @@ public string BaseUrl ///
Specifying a corpus: ///
* A (legacy) corpus is selected by specifying CorpusId and a parallel corpus is selected by specifying ParallelCorpusId. ///
* A parallel corpus can be further filtered by specifying particular CorpusIds in SourceFilters or TargetFilters. - ///
+ ///
///
Filtering by textID or chapter: ///
* Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. ///
* Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) ///
* All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. - ///
+ ///
///
Filter - train on all or none ///
* If trainOn or pretranslate is not provided, all corpora will be used for training or pretranslation respectively ///
* If a corpus is selected for training or pretranslation and neither scriptureRange nor textIds are defined, all of the selected corpus will be used. ///
* If a corpus is selected for training or pretranslation and an empty scriptureRange or textIds is defined, none of the selected corpus will be used. ///
* If a corpus is selected for training or pretranslation but no further filters are provided, all selected corpora will be used for training or pretranslation respectively. - ///
+ ///
///
Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation, ///
the following text will be pretranslated: ///
* Text segments that are in the source and not the target (untranslated) ///
* Text segments that are in the source and the target, but where that target segment is not trained on. ///
If the engine does not support pretranslation, these fields have no effect. ///
Pretranslating has the same filtering as training. - ///
+ ///
///
The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. ///
See [nmt job settings documentation](https://github.com/sillsdev/serval/wiki/NMT-Build-Options) about configuring job parameters. ///
See [smt-transfer job settings documentation](https://github.com/sillsdev/serval/wiki/SMT-Transfer-Build-Options) about configuring job parameters. ///
See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. - ///
+ ///
///
When using a parallel corpus: ///
* If, within a single parallel corpus, multiple source corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only using the scriptureRange), those sources will be mixed where they overlap by randomly choosing from each source per line/verse. ///
* If, within a single parallel corpus, multiple target corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only using the scriptureRange), only the first of the targets that includes that textId/book will be used for that textId/book. @@ -5974,10 +5978,10 @@ public string BaseUrl /// /// If a Nmt build was successful and IsModelPersisted is `true` for the engine, ///
then the model from the most recent successful build can be downloaded. - ///
+ ///
///
The endpoint will return a URL that can be used to download the model for up to 1 hour ///
after the request is made. If the URL is not used within that time, a new request will need to be made. - ///
+ ///
///
The download itself is created by g-zipping together the folder containing the fine tuned model ///
with all necessary supporting files. This zipped folder is then named by the pattern: ///
* <engine_id>_<model_revision>.tar.gz @@ -6160,7 +6164,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -6172,7 +6176,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -6572,7 +6576,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -6584,7 +6588,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -7129,7 +7133,7 @@ private string ConvertToString(object? 
value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -7141,7 +7145,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -7199,7 +7203,7 @@ public partial interface IWordAlignmentEnginesClient ///
The echo-word-alignment engine has full coverage of all endpoints. Endpoints like create and build return empty responses. ///
Endpoints like align echo the sent content back to the user in the proper format. This engine is useful for debugging and testing purposes. ///
## Sample request: - ///
+ ///
///
{ ///
"name": "myTeam:myProject:myEngine", ///
"sourceLanguage": "el", @@ -7314,7 +7318,7 @@ public partial interface IWordAlignmentEnginesClient ///
* **SourceTokens**: the tokenized source segment ///
* **TargetTokens**: the tokenized target segment ///
* **Alignment**: a list of aligned word pairs with associated scores - ///
+ ///
///
Word alignments can be filtered by text id if provided. ///
Only word alignments for the most recent successful build of the engine are returned. ///
@@ -7344,10 +7348,10 @@ public partial interface IWordAlignmentEnginesClient ///
Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. ///
Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) ///
All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. - ///
+ ///
///
Specify the corpora or textIds to word align on. ///
When a corpus or textId is selected for word align on, only text segments that are in both the source and the target will be aligned. - ///
+ ///
///
The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. ///
See [statistical alignment job settings documentation](https://github.com/sillsdev/serval/wiki/Statistical-Alignment-Build-Options) about configuring job parameters. ///
See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. @@ -7561,7 +7565,7 @@ public string BaseUrl ///
The echo-word-alignment engine has full coverage of all endpoints. Endpoints like create and build return empty responses. ///
Endpoints like align echo the sent content back to the user in the proper format. This engine is useful for debugging and testing purposes. ///
## Sample request: - ///
+ ///
///
{ ///
"name": "myTeam:myProject:myEngine", ///
"sourceLanguage": "el", @@ -8582,7 +8586,7 @@ public string BaseUrl ///
* **SourceTokens**: the tokenized source segment ///
* **TargetTokens**: the tokenized target segment ///
* **Alignment**: a list of aligned word pairs with associated scores - ///
+ ///
///
Word alignments can be filtered by text id if provided. ///
Only word alignments for the most recent successful build of the engine are returned. /// @@ -8819,10 +8823,10 @@ public string BaseUrl ///
Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. ///
Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) ///
All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. - ///
+ ///
///
Specify the corpora or textIds to word align on. ///
When a corpus or textId is selected for word align on, only text segments that are in both the source and the target will be aligned. - ///
+ ///
///
The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. ///
See [statistical alignment job settings documentation](https://github.com/sillsdev/serval/wiki/Statistical-Alignment-Build-Options) about configuring job parameters. ///
See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. @@ -9400,7 +9404,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -9412,7 +9416,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -9673,7 +9677,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -9685,7 +9689,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? 
string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -10341,6 +10345,18 @@ public enum PretranslationUsfmMarkerBehavior } + [System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.1.0.0 (NJsonSchema v11.0.2.0 (Newtonsoft.Json v13.0.0.0))")] + public enum PretranslationQuotationMarkBehavior + { + + [System.Runtime.Serialization.EnumMember(Value = @"NormalizedSourceQuotes")] + NormalizedSourceQuotes = 0, + + [System.Runtime.Serialization.EnumMember(Value = @"TargetQuotes")] + TargetQuotes = 1, + + } + [System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.1.0.0 (NJsonSchema v11.0.2.0 (Newtonsoft.Json v13.0.0.0))")] public partial class TranslationBuild { @@ -10525,23 +10541,6 @@ public enum PhaseStage } - [System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.1.0.0 (NJsonSchema v11.0.2.0 (Newtonsoft.Json v13.0.0.0))")] - public partial class ParallelCorpusAnalysis - { - [Newtonsoft.Json.JsonProperty("parallelCorpusRef", Required = Newtonsoft.Json.Required.Always)] - [System.ComponentModel.DataAnnotations.Required(AllowEmptyStrings = true)] - public string ParallelCorpusRef { get; set; } = default!; - - [Newtonsoft.Json.JsonProperty("sourceQuoteConvention", Required = Newtonsoft.Json.Required.Always)] - [System.ComponentModel.DataAnnotations.Required(AllowEmptyStrings = true)] - public string SourceQuoteConvention { get; set; } = default!; - - [Newtonsoft.Json.JsonProperty("targetQuoteConvention", Required = Newtonsoft.Json.Required.Always)] - [System.ComponentModel.DataAnnotations.Required(AllowEmptyStrings = true)] - public string TargetQuoteConvention { get; set; } = default!; - - } - [System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.1.0.0 (NJsonSchema v11.0.2.0 (Newtonsoft.Json v13.0.0.0))")] public partial class TranslationBuildConfig { diff --git a/src/Serval/src/Serval.Translation/Contracts/CorpusAnalysisDto.cs 
b/src/Serval/src/Serval.Translation/Contracts/CorpusAnalysisDto.cs new file mode 100644 index 00000000..68501314 --- /dev/null +++ b/src/Serval/src/Serval.Translation/Contracts/CorpusAnalysisDto.cs @@ -0,0 +1,8 @@ +namespace Serval.Translation.Contracts; + +public record CorpusAnalysisDto +{ + public required string CorpusRef { get; init; } + public required string SourceQuoteConvention { get; init; } + public required string TargetQuoteConvention { get; init; } +} diff --git a/src/Serval/src/Serval.Translation/Contracts/PretranslationQuotationMarkBehavior.cs b/src/Serval/src/Serval.Translation/Contracts/PretranslationQuotationMarkBehavior.cs new file mode 100644 index 00000000..5ac75943 --- /dev/null +++ b/src/Serval/src/Serval.Translation/Contracts/PretranslationQuotationMarkBehavior.cs @@ -0,0 +1,7 @@ +namespace Serval.Translation.Contracts; + +public enum PretranslationQuotationMarkBehavior +{ + NormalizedSourceQuotes, + TargetQuotes +} diff --git a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs index 3ca47f0e..09ecb5da 100644 --- a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs +++ b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs @@ -941,6 +941,7 @@ public async Task GetPretranslatedUsfmAsync( [FromQuery(Name = "paragraph-marker-behavior")] PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior, [FromQuery(Name = "embed-behavior")] PretranslationUsfmMarkerBehavior? embedBehavior, [FromQuery(Name = "style-marker-behavior")] PretranslationUsfmMarkerBehavior? styleMarkerBehavior, + [FromQuery(Name = "quotation-mark-behavior")] PretranslationQuotationMarkBehavior? quotationMarkBehavior, CancellationToken cancellationToken ) { @@ -961,6 +962,7 @@ CancellationToken cancellationToken paragraphMarkerBehavior ?? PretranslationUsfmMarkerBehavior.Preserve, embedBehavior ?? 
PretranslationUsfmMarkerBehavior.Preserve, styleMarkerBehavior ?? PretranslationUsfmMarkerBehavior.Strip, + quotationMarkBehavior ?? PretranslationQuotationMarkBehavior.NormalizedSourceQuotes, cancellationToken ); if (usfm == "") @@ -1891,16 +1893,6 @@ private static PhaseDto Map(BuildPhase source) StepCount = source.StepCount }; } - - private static ParallelCorpusAnalysisDto Map(ParallelCorpusAnalysis source) - { - return new ParallelCorpusAnalysisDto - { - ParallelCorpusRef = source.ParallelCorpusRef, - SourceQuoteConvention = source.SourceQuoteConvention, - TargetQuoteConvention = source.TargetQuoteConvention, - }; - } } #pragma warning restore CS0612 // Type or member is obsolete diff --git a/src/Serval/src/Serval.Translation/Models/Build.cs b/src/Serval/src/Serval.Translation/Models/Build.cs index e3ee1c63..53274bb2 100644 --- a/src/Serval/src/Serval.Translation/Models/Build.cs +++ b/src/Serval/src/Serval.Translation/Models/Build.cs @@ -20,5 +20,5 @@ public record Build : IInitializableEntity public bool? IsInitialized { get; set; } public DateTime? DateCreated { get; set; } public IReadOnlyList? Phases { get; init; } - public IReadOnlyCollection? Analysis { get; init; } + public IReadOnlyList? 
Analysis { get; init; } } diff --git a/src/Serval/src/Serval.Translation/Models/CorpusAnalysis.cs b/src/Serval/src/Serval.Translation/Models/CorpusAnalysis.cs new file mode 100644 index 00000000..a894d6be --- /dev/null +++ b/src/Serval/src/Serval.Translation/Models/CorpusAnalysis.cs @@ -0,0 +1,8 @@ +namespace Serval.Shared.Models; + +public record CorpusAnalysis +{ + public required string CorpusRef { get; init; } + public required string SourceQuoteConvention { get; init; } + public required string TargetQuoteConvention { get; init; } +} diff --git a/src/Serval/src/Serval.Translation/Services/IPretranslationService.cs b/src/Serval/src/Serval.Translation/Services/IPretranslationService.cs index af5c876d..2a7810cd 100644 --- a/src/Serval/src/Serval.Translation/Services/IPretranslationService.cs +++ b/src/Serval/src/Serval.Translation/Services/IPretranslationService.cs @@ -20,6 +20,7 @@ Task GetUsfmAsync( PretranslationUsfmMarkerBehavior paragraphMarkerBehavior, PretranslationUsfmMarkerBehavior embedBehavior, PretranslationUsfmMarkerBehavior styleMarkerBehavior, + PretranslationQuotationMarkBehavior quotationMarkBehavior, CancellationToken cancellationToken = default ); } diff --git a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs index 2952c0b8..3b005b4f 100644 --- a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs +++ b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs @@ -1,4 +1,5 @@ using SIL.Machine.Corpora; +using SIL.Machine.PunctuationAnalysis; using SIL.Machine.Translation; namespace Serval.Translation.Services; @@ -44,6 +45,7 @@ public async Task GetUsfmAsync( PretranslationUsfmMarkerBehavior paragraphMarkerBehavior, PretranslationUsfmMarkerBehavior embedBehavior, PretranslationUsfmMarkerBehavior styleMarkerBehavior, + PretranslationQuotationMarkBehavior quotationMarkBehavior, CancellationToken cancellationToken = default ) { @@ 
-68,6 +70,8 @@ public async Task GetUsfmAsync( styleMarkerBehavior ); + List remarks = [disclaimerRemark, markerPlacementRemark]; + CorpusFile sourceFile; CorpusFile targetFile; if (corpus is not null) @@ -147,6 +151,20 @@ PretranslationUsfmMarkerBehavior StyleBehavior if (paragraphMarkerBehavior == PretranslationUsfmMarkerBehavior.PreservePosition) updateBlockHandlers.Add(new PlaceMarkersUsfmUpdateBlockHandler()); + if (paragraphMarkerBehavior == PretranslationUsfmMarkerBehavior.PreservePosition) + { + IEnumerable alignmentInfo = pretranslations.Select( + p => new PlaceMarkersAlignmentInfo( + p.Refs, + p.SourceTokens?.ToList() ?? [], + p.TranslationTokens?.ToList() ?? [], + Map(p.Alignment) + ) + ); + updateBlockHandlers.Add(new PlaceMarkersUsfmUpdateBlockHandler(alignmentInfo)); + } + + string usfm = ""; // Update the target book if it exists if (template is PretranslationUsfmTemplate.Auto or PretranslationUsfmTemplate.Target) { @@ -162,7 +180,6 @@ PretranslationUsfmMarkerBehavior StyleBehavior ); using Shared.Services.ZipParatextProjectTextUpdater updater = _scriptureDataFileService.GetZipParatextProjectTextUpdater(targetFile.Filename); - string usfm = ""; switch (textOrigin) { case PretranslationUsfmTextOrigin.PreferExisting: @@ -222,12 +239,12 @@ PretranslationUsfmMarkerBehavior StyleBehavior ) ?? 
""; break; } - // In order to support PretranslationUsfmTemplate.Auto - if (!string.IsNullOrEmpty(usfm)) - return usfm; } - if (template is PretranslationUsfmTemplate.Auto or PretranslationUsfmTemplate.Source) + if ( + string.IsNullOrEmpty(usfm) + && (template is PretranslationUsfmTemplate.Auto or PretranslationUsfmTemplate.Source) + ) { using Shared.Services.ZipParatextProjectTextUpdater updater = _scriptureDataFileService.GetZipParatextProjectTextUpdater(sourceFile.Filename); @@ -238,7 +255,8 @@ PretranslationUsfmMarkerBehavior StyleBehavior case PretranslationUsfmTextOrigin.PreferExisting: case PretranslationUsfmTextOrigin.PreferPretranslated: case PretranslationUsfmTextOrigin.OnlyPretranslated: - return updater.UpdateUsfm( + usfm = + updater.UpdateUsfm( textId, pretranslationRows.Select(Map).ToList(), fullName: targetSettings.FullName, @@ -249,8 +267,10 @@ PretranslationUsfmMarkerBehavior StyleBehavior updateBlockHandlers: updateBlockHandlers, remarks: [disclaimerRemark, markerPlacementRemark] ) ?? ""; + break; case PretranslationUsfmTextOrigin.OnlyExisting: - return updater.UpdateUsfm( + usfm = + updater.UpdateUsfm( textId, [], // don't pass the pretranslations, we only want the existing text. fullName: targetSettings.FullName, @@ -261,10 +281,94 @@ PretranslationUsfmMarkerBehavior StyleBehavior updateBlockHandlers: updateBlockHandlers, remarks: [disclaimerRemark, markerPlacementRemark] ) ?? 
""; + break; + } + } + if (quotationMarkBehavior == PretranslationQuotationMarkBehavior.TargetQuotes) + { + if (build.Analysis is null) + { + throw new InvalidOperationException( + $"Unable to denormalize quotation marks: No quote convention analysis exists for build {build.Id}" + ); + } + if (!build.Analysis.Any(a => a.CorpusRef == corpusId)) + { + throw new InvalidOperationException( + $"Unable to denormalize quotation marks: No quote convention analysis exists for corpus {corpusId}" + ); } + CorpusAnalysis analysis = build.Analysis.Single(c => c.CorpusRef == corpusId); + (string denormalizedUsfm, IReadOnlyList denormalizationRemarks) = DenormalizeQuotationMarks( + usfm, + analysis + ); + usfm = denormalizedUsfm; + remarks.AddRange(denormalizationRemarks); + } + var remarkUpdater = new UpdateUsfmParserHandler(remarks: remarks); + UsfmParser.Parse(usfm, remarkUpdater); + + return remarkUpdater.GetUsfm(); + } + + private static (string Usfm, IReadOnlyList Remarks) DenormalizeQuotationMarks( + string usfm, + CorpusAnalysis analysis + ) + { + QuoteConvention sourceQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + analysis.SourceQuoteConvention + ); + if (sourceQuoteConvention is null) + { + throw new InvalidOperationException( + $"Unable to denormalize quotation marks: No such convention {analysis.SourceQuoteConvention}" + ); } + QuoteConvention targetQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + analysis.TargetQuoteConvention + ); + if (targetQuoteConvention is null) + { + throw new InvalidOperationException( + $"Unable to denormalize quotation marks: No such convention {analysis.TargetQuoteConvention}" + ); + } + QuotationMarkDenormalizationFirstPass quotationMarkDenormalizationFirstPass = + new(sourceQuoteConvention, targetQuoteConvention); + + UsfmParser.Parse(usfm, quotationMarkDenormalizationFirstPass); + List bestChapterStrategies = + 
quotationMarkDenormalizationFirstPass.FindBestChapterStrategies(); + + QuotationMarkDenormalizationUsfmUpdateBlockHandler quotationMarkDenormalizer = + new( + sourceQuoteConvention, + targetQuoteConvention, + new QuotationMarkUpdateSettings(chapterStrategies: bestChapterStrategies) + ); + List remarks = []; + if (bestChapterStrategies.Any(s => s != QuotationMarkUpdateStrategy.Skip)) + { + string quotationDenormalizationRemark = + "Quotation marks in the following chapters have been automatically denormalized after translation: " + + string.Join( + ", ", + bestChapterStrategies + .Select((strategy, index) => (strategy, index)) + .Where(tuple => tuple.strategy != QuotationMarkUpdateStrategy.Skip) + .Select(tuple => tuple.index + 1) + ) + + "."; + remarks.Add(quotationDenormalizationRemark); + } + + var updater = new UpdateUsfmParserHandler(updateBlockHandlers: [quotationMarkDenormalizer]); + UsfmParser.Parse(usfm, updater); - return ""; + usfm = updater.GetUsfm(); + return (usfm, remarks); } /// diff --git a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs index 1f714542..6943337a 100644 --- a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs +++ b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs @@ -37,7 +37,7 @@ public async Task GetUsfmAsync_Source_PreferExisting() \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph and embed markers were moved to the end of the verse. Style markers were removed. \c 1 -\v 1 Chapter 1, verse 1. Translated new paragraph +\v 1 Chapter 1, verse 1. ""Translated new paragraph"" \p \v 2 Chapter 1, verse 2. \v 3 @@ -64,7 +64,7 @@ public async Task GetUsfmAsync_Source_PreferPretranslated() \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. 
It should be reviewed and edited carefully. \rem Paragraph and embed markers were moved to the end of the verse. Style markers were removed. \c 1 -\v 1 Chapter 1, verse 1. Translated new paragraph +\v 1 Chapter 1, verse 1. ""Translated new paragraph"" \p \v 2 Chapter 1, verse 2. \v 3 @@ -118,7 +118,7 @@ public async Task GetUsfmAsync_Source_OnlyPretranslated() \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph and embed markers were moved to the end of the verse. Style markers were removed. \c 1 -\v 1 Chapter 1, verse 1. Translated new paragraph +\v 1 Chapter 1, verse 1. ""Translated new paragraph"" \p \v 2 Chapter 1, verse 2. \v 3 @@ -147,7 +147,7 @@ public async Task GetUsfmAsync_Source_PlaceMarkers() \rem Embed markers were moved to the end of the verse. Paragraph markers have positions preserved. Style markers were removed. \c 1 \v 1 Chapter 1, verse 1. -\p Translated new paragraph +\p ""Translated new paragraph"" \v 2 Chapter 1, verse 2. \v 3 " @@ -201,7 +201,7 @@ public async Task GetUsfmAsync_Target_PreferPretranslated() \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph and embed markers were moved to the end of the verse. Style markers were removed. \c 1 -\v 1 Chapter 1, verse 1. Translated new paragraph +\v 1 Chapter 1, verse 1. ""Translated new paragraph"" \v 2 Chapter 1, verse 2. \v 3 TRG - Chapter one, verse three. " @@ -240,7 +240,7 @@ public async Task GetUsfmAsync_Auto_TargetBookDoesNotExist() \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph and embed markers were moved to the end of the verse. Style markers were removed. \c 1 -\v 1 Chapter 1, verse 1. Translated new paragraph +\v 1 Chapter 1, verse 1. ""Translated new paragraph"" \p \v 2 Chapter 1, verse 2. 
\v 3 @@ -268,7 +268,7 @@ public async Task GetUsfmAsync_Auto_TargetBookExists() \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph and embed markers were moved to the end of the verse. Style markers were removed. \c 1 -\v 1 Chapter 1, verse 1. Translated new paragraph +\v 1 Chapter 1, verse 1. ""Translated new paragraph"" \v 2 Chapter 1, verse 2. \v 3 TRG - Chapter one, verse three. " @@ -319,7 +319,7 @@ public async Task GetUsfmAsync_Target_OnlyPretranslated() \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph and embed markers were moved to the end of the verse. Style markers were removed. \c 1 -\v 1 Chapter 1, verse 1. Translated new paragraph +\v 1 Chapter 1, verse 1. ""Translated new paragraph"" \v 2 Chapter 1, verse 2. \v 3 " @@ -340,11 +340,33 @@ public async Task GetUsfmAsync_Disclaimer_Remark_Shown() Assert.That(usfm, Does.Contain("rem This draft")); } + [Test] + public async Task GetUsfmAsync_DenormalizeQuotationMarks() + { + using TestEnvironment env = new(); + + string usfm = await env.GetUsfmAsync( + PretranslationUsfmTextOrigin.PreferExisting, + PretranslationUsfmTemplate.Source, + quotationMarkBehavior: PretranslationQuotationMarkBehavior.TargetQuotes + ); + Assert.That(usfm, Does.Contain("“Translated new paragraph”")); + Assert.That(Regex.Matches(usfm, @"\\rem"), Has.Count.EqualTo(3)); + + usfm = await env.GetUsfmAsync( + PretranslationUsfmTextOrigin.PreferExisting, + PretranslationUsfmTemplate.Source, + quotationMarkBehavior: PretranslationQuotationMarkBehavior.NormalizedSourceQuotes + ); + Assert.That(usfm, Does.Contain("\"Translated new paragraph\"")); + Assert.That(Regex.Matches(usfm, @"\\rem"), Has.Count.EqualTo(2)); + } + private class TestEnvironment : IDisposable { public TestEnvironment() { - Shared.Models.CorpusFile file1 = + CorpusFile file1 = new() { Id = "file1", @@ -352,7 +374,7 
@@ public TestEnvironment() Format = Shared.Contracts.FileFormat.Paratext, TextId = "project1" }; - Shared.Models.CorpusFile file2 = + CorpusFile file2 = new() { Id = "file2", @@ -395,7 +417,7 @@ public TestEnvironment() new() { Id = "parallel_corpus1", - SourceCorpora = new List() + SourceCorpora = new List() { new() { @@ -404,7 +426,7 @@ public TestEnvironment() Files = [file1], } }, - TargetCorpora = new List() + TargetCorpora = new List() { new() { @@ -425,13 +447,31 @@ public TestEnvironment() { Id = "build1", EngineRef = "engine1", - DateFinished = DateTime.UnixEpoch + DateFinished = DateTime.UnixEpoch, + Analysis = + [ + new CorpusAnalysis() + { + CorpusRef = "corpus1", + SourceQuoteConvention = "standard_english", + TargetQuoteConvention = "standard_english" + } + ] }, new() { Id = "build2", EngineRef = "parallel_engine1", - DateFinished = DateTime.UnixEpoch + DateFinished = DateTime.UnixEpoch, + Analysis = + [ + new CorpusAnalysis() + { + CorpusRef = "parallel_corpus1", + SourceQuoteConvention = "standard_english", + TargetQuoteConvention = "standard_english" + } + ] } ] ); @@ -445,9 +485,22 @@ public TestEnvironment() CorpusRef = "corpus1", TextId = "MAT", Refs = ["MAT 1:1"], - Translation = "Chapter 1, verse 1. Translated new paragraph", + Translation = "Chapter 1, verse 1. 
\"Translated new paragraph\"", SourceTokens = ["SRC", "-", "Chapter", "one", ",", "verse", "one", ".", "new", "paragraph"], - TranslationTokens = ["Chapter", "1", ",", "verse", "1", ".", "Translated", "new", "paragraph"], + TranslationTokens = + [ + "Chapter", + "1", + ",", + "verse", + "1", + ".", + "\"", + "Translated", + "new", + "paragraph", + "\"" + ], Alignment = [ new() { SourceIndex = 2, TargetIndex = 0 }, @@ -456,9 +509,9 @@ public TestEnvironment() new() { SourceIndex = 5, TargetIndex = 3 }, new() { SourceIndex = 6, TargetIndex = 4 }, new() { SourceIndex = 7, TargetIndex = 5 }, - new() { SourceIndex = 8, TargetIndex = 6 }, new() { SourceIndex = 8, TargetIndex = 7 }, - new() { SourceIndex = 9, TargetIndex = 8 }, + new() { SourceIndex = 8, TargetIndex = 8 }, + new() { SourceIndex = 9, TargetIndex = 9 }, ] }, new() @@ -479,9 +532,22 @@ public TestEnvironment() CorpusRef = "parallel_corpus1", TextId = "MAT", Refs = ["MAT 1:1"], - Translation = "Chapter 1, verse 1. Translated new paragraph", + Translation = "Chapter 1, verse 1. 
\"Translated new paragraph\"", SourceTokens = ["SRC", "-", "Chapter", "one", ",", "verse", "one", ".", "new", "paragraph"], - TranslationTokens = ["Chapter", "1", ",", "verse", "1", ".", "Translated", "new", "paragraph"], + TranslationTokens = + [ + "Chapter", + "1", + ",", + "verse", + "1", + ".", + "\"", + "Translated", + "new", + "paragraph", + "\"" + ], Alignment = [ new() { SourceIndex = 2, TargetIndex = 0 }, @@ -490,9 +556,9 @@ public TestEnvironment() new() { SourceIndex = 5, TargetIndex = 3 }, new() { SourceIndex = 6, TargetIndex = 4 }, new() { SourceIndex = 7, TargetIndex = 5 }, - new() { SourceIndex = 8, TargetIndex = 6 }, new() { SourceIndex = 8, TargetIndex = 7 }, - new() { SourceIndex = 9, TargetIndex = 8 }, + new() { SourceIndex = 8, TargetIndex = 8 }, + new() { SourceIndex = 9, TargetIndex = 9 }, ] }, new() @@ -557,7 +623,9 @@ Shared.Services.ZipParatextProjectTextUpdater GetTextUpdater(string type) public async Task GetUsfmAsync( PretranslationUsfmTextOrigin textOrigin, PretranslationUsfmTemplate template, - PretranslationUsfmMarkerBehavior paragraphMarkerBehavior = PretranslationUsfmMarkerBehavior.Preserve + PretranslationUsfmMarkerBehavior paragraphMarkerBehavior = PretranslationUsfmMarkerBehavior.Preserve, + PretranslationQuotationMarkBehavior quotationMarkBehavior = + PretranslationQuotationMarkBehavior.NormalizedSourceQuotes ) { string usfm = await Service.GetUsfmAsync( @@ -569,7 +637,8 @@ public async Task GetUsfmAsync( template: template, paragraphMarkerBehavior: paragraphMarkerBehavior, embedBehavior: PretranslationUsfmMarkerBehavior.Preserve, - styleMarkerBehavior: PretranslationUsfmMarkerBehavior.Strip + styleMarkerBehavior: PretranslationUsfmMarkerBehavior.Strip, + quotationMarkBehavior: quotationMarkBehavior ); usfm = usfm.Replace("\r\n", "\n"); string parallel_usfm = await Service.GetUsfmAsync( @@ -581,7 +650,8 @@ public async Task GetUsfmAsync( template: template, paragraphMarkerBehavior: paragraphMarkerBehavior, embedBehavior: 
PretranslationUsfmMarkerBehavior.Preserve, - styleMarkerBehavior: PretranslationUsfmMarkerBehavior.Strip + styleMarkerBehavior: PretranslationUsfmMarkerBehavior.Strip, + quotationMarkBehavior: quotationMarkBehavior ); parallel_usfm = parallel_usfm.Replace("\r\n", "\n"); Assert.That(parallel_usfm, Is.EqualTo(usfm)); diff --git a/src/Serval/test/Serval.Translation.Tests/Usings.cs b/src/Serval/test/Serval.Translation.Tests/Usings.cs index 8b984c89..1ef6c9b3 100644 --- a/src/Serval/test/Serval.Translation.Tests/Usings.cs +++ b/src/Serval/test/Serval.Translation.Tests/Usings.cs @@ -1,4 +1,5 @@ global using System.Text; +global using System.Text.RegularExpressions; global using Grpc.Core; global using Grpc.Net.ClientFactory; global using MassTransit; From 079d579f407a6c2ece46361abbb3d926a25689cc Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 11 Aug 2025 11:18:39 -0400 Subject: [PATCH 2/5] Respond to reviewer comments --- src/Serval/src/Serval.Client/Client.g.cs | 14 ++++++------- .../PretranslationQuotationMarkBehavior.cs | 6 +++--- .../TranslationEnginesController.cs | 4 ++-- .../Services/IPretranslationService.cs | 2 +- .../Services/PretranslationService.cs | 20 +++++-------------- .../Services/PretranslationServiceTests.cs | 11 +++++----- 6 files changed, 23 insertions(+), 34 deletions(-) diff --git a/src/Serval/src/Serval.Client/Client.g.cs b/src/Serval/src/Serval.Client/Client.g.cs index f86b52b5..3483a9a2 100644 --- a/src/Serval/src/Serval.Client/Client.g.cs +++ b/src/Serval/src/Serval.Client/Client.g.cs @@ -2484,7 +2484,7 @@ public partial interface ITranslationEnginesClient /// The behavior of style markers. /// The book in USFM format /// A server side error occurred. - System.Threading.Tasks.Task GetPretranslatedUsfmAsync(string id, string corpusId, string textId, PretranslationUsfmTextOrigin? textOrigin = null, PretranslationUsfmTemplate? template = null, PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior = null, PretranslationUsfmMarkerBehavior? 
embedBehavior = null, PretranslationUsfmMarkerBehavior? styleMarkerBehavior = null, PretranslationQuotationMarkBehavior? quotationMarkBehavior = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)); + System.Threading.Tasks.Task GetPretranslatedUsfmAsync(string id, string corpusId, string textId, PretranslationUsfmTextOrigin? textOrigin = null, PretranslationUsfmTemplate? template = null, PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior = null, PretranslationUsfmMarkerBehavior? embedBehavior = null, PretranslationUsfmMarkerBehavior? styleMarkerBehavior = null, PretranslationNormalizationBehavior? quotationMarkBehavior = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)); /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// @@ -5181,7 +5181,7 @@ public string BaseUrl /// The behavior of style markers. /// The book in USFM format /// A server side error occurred. - public virtual async System.Threading.Tasks.Task GetPretranslatedUsfmAsync(string id, string corpusId, string textId, PretranslationUsfmTextOrigin? textOrigin = null, PretranslationUsfmTemplate? template = null, PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior = null, PretranslationUsfmMarkerBehavior? embedBehavior = null, PretranslationUsfmMarkerBehavior? styleMarkerBehavior = null, PretranslationQuotationMarkBehavior? quotationMarkBehavior = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) + public virtual async System.Threading.Tasks.Task GetPretranslatedUsfmAsync(string id, string corpusId, string textId, PretranslationUsfmTextOrigin? textOrigin = null, PretranslationUsfmTemplate? template = null, PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior = null, PretranslationUsfmMarkerBehavior? embedBehavior = null, PretranslationUsfmMarkerBehavior? 
styleMarkerBehavior = null, PretranslationNormalizationBehavior? quotationMarkBehavior = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) { if (id == null) throw new System.ArgumentNullException("id"); @@ -10346,14 +10346,14 @@ public enum PretranslationUsfmMarkerBehavior } [System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.1.0.0 (NJsonSchema v11.0.2.0 (Newtonsoft.Json v13.0.0.0))")] - public enum PretranslationQuotationMarkBehavior + public enum PretranslationNormalizationBehavior { - [System.Runtime.Serialization.EnumMember(Value = @"NormalizedSourceQuotes")] - NormalizedSourceQuotes = 0, + [System.Runtime.Serialization.EnumMember(Value = @"Normalized")] + Normalized = 0, - [System.Runtime.Serialization.EnumMember(Value = @"TargetQuotes")] - TargetQuotes = 1, + [System.Runtime.Serialization.EnumMember(Value = @"Denormalized")] + Denormalized = 1, } diff --git a/src/Serval/src/Serval.Translation/Contracts/PretranslationQuotationMarkBehavior.cs b/src/Serval/src/Serval.Translation/Contracts/PretranslationQuotationMarkBehavior.cs index 5ac75943..fefedb6a 100644 --- a/src/Serval/src/Serval.Translation/Contracts/PretranslationQuotationMarkBehavior.cs +++ b/src/Serval/src/Serval.Translation/Contracts/PretranslationQuotationMarkBehavior.cs @@ -1,7 +1,7 @@ namespace Serval.Translation.Contracts; -public enum PretranslationQuotationMarkBehavior +public enum PretranslationNormalizationBehavior { - NormalizedSourceQuotes, - TargetQuotes + Normalized, + Denormalized } diff --git a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs index 09ecb5da..e6794b3d 100644 --- a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs +++ b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs @@ -941,7 +941,7 @@ public async Task GetPretranslatedUsfmAsync( [FromQuery(Name = 
"paragraph-marker-behavior")] PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior, [FromQuery(Name = "embed-behavior")] PretranslationUsfmMarkerBehavior? embedBehavior, [FromQuery(Name = "style-marker-behavior")] PretranslationUsfmMarkerBehavior? styleMarkerBehavior, - [FromQuery(Name = "quotation-mark-behavior")] PretranslationQuotationMarkBehavior? quotationMarkBehavior, + [FromQuery(Name = "quotation-mark-behavior")] PretranslationNormalizationBehavior? quotationMarkBehavior, CancellationToken cancellationToken ) { @@ -962,7 +962,7 @@ CancellationToken cancellationToken paragraphMarkerBehavior ?? PretranslationUsfmMarkerBehavior.Preserve, embedBehavior ?? PretranslationUsfmMarkerBehavior.Preserve, styleMarkerBehavior ?? PretranslationUsfmMarkerBehavior.Strip, - quotationMarkBehavior ?? PretranslationQuotationMarkBehavior.NormalizedSourceQuotes, + quotationMarkBehavior ?? PretranslationNormalizationBehavior.Normalized, cancellationToken ); if (usfm == "") diff --git a/src/Serval/src/Serval.Translation/Services/IPretranslationService.cs b/src/Serval/src/Serval.Translation/Services/IPretranslationService.cs index 2a7810cd..ede5c9a5 100644 --- a/src/Serval/src/Serval.Translation/Services/IPretranslationService.cs +++ b/src/Serval/src/Serval.Translation/Services/IPretranslationService.cs @@ -20,7 +20,7 @@ Task GetUsfmAsync( PretranslationUsfmMarkerBehavior paragraphMarkerBehavior, PretranslationUsfmMarkerBehavior embedBehavior, PretranslationUsfmMarkerBehavior styleMarkerBehavior, - PretranslationQuotationMarkBehavior quotationMarkBehavior, + PretranslationNormalizationBehavior quoteNormalizationBehavior, CancellationToken cancellationToken = default ); } diff --git a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs index 3b005b4f..047435a9 100644 --- a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs +++ 
b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs @@ -45,7 +45,7 @@ public async Task GetUsfmAsync( PretranslationUsfmMarkerBehavior paragraphMarkerBehavior, PretranslationUsfmMarkerBehavior embedBehavior, PretranslationUsfmMarkerBehavior styleMarkerBehavior, - PretranslationQuotationMarkBehavior quotationMarkBehavior, + PretranslationNormalizationBehavior quoteNormalizationBehavior, CancellationToken cancellationToken = default ) { @@ -152,17 +152,7 @@ PretranslationUsfmMarkerBehavior StyleBehavior updateBlockHandlers.Add(new PlaceMarkersUsfmUpdateBlockHandler()); if (paragraphMarkerBehavior == PretranslationUsfmMarkerBehavior.PreservePosition) - { - IEnumerable alignmentInfo = pretranslations.Select( - p => new PlaceMarkersAlignmentInfo( - p.Refs, - p.SourceTokens?.ToList() ?? [], - p.TranslationTokens?.ToList() ?? [], - Map(p.Alignment) - ) - ); - updateBlockHandlers.Add(new PlaceMarkersUsfmUpdateBlockHandler(alignmentInfo)); - } + updateBlockHandlers.Add(new PlaceMarkersUsfmUpdateBlockHandler()); string usfm = ""; // Update the target book if it exists @@ -284,7 +274,7 @@ PretranslationUsfmMarkerBehavior StyleBehavior break; } } - if (quotationMarkBehavior == PretranslationQuotationMarkBehavior.TargetQuotes) + if (quoteNormalizationBehavior == PretranslationNormalizationBehavior.Denormalized) { if (build.Analysis is null) { @@ -317,7 +307,7 @@ private static (string Usfm, IReadOnlyList Remarks) DenormalizeQuotation CorpusAnalysis analysis ) { - QuoteConvention sourceQuoteConvention = StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + QuoteConvention sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( analysis.SourceQuoteConvention ); if (sourceQuoteConvention is null) @@ -326,7 +316,7 @@ CorpusAnalysis analysis $"Unable to denormalize quotation marks: No such convention {analysis.SourceQuoteConvention}" ); } - QuoteConvention targetQuoteConvention = 
StandardQuoteConventions.QuoteConventions.GetQuoteConventionByName( + QuoteConvention targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( analysis.TargetQuoteConvention ); if (targetQuoteConvention is null) diff --git a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs index 6943337a..5fe30186 100644 --- a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs +++ b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs @@ -348,7 +348,7 @@ public async Task GetUsfmAsync_DenormalizeQuotationMarks() string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferExisting, PretranslationUsfmTemplate.Source, - quotationMarkBehavior: PretranslationQuotationMarkBehavior.TargetQuotes + quotationMarkBehavior: PretranslationNormalizationBehavior.Denormalized ); Assert.That(usfm, Does.Contain("“Translated new paragraph”")); Assert.That(Regex.Matches(usfm, @"\\rem"), Has.Count.EqualTo(3)); @@ -356,7 +356,7 @@ public async Task GetUsfmAsync_DenormalizeQuotationMarks() usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferExisting, PretranslationUsfmTemplate.Source, - quotationMarkBehavior: PretranslationQuotationMarkBehavior.NormalizedSourceQuotes + quotationMarkBehavior: PretranslationNormalizationBehavior.Normalized ); Assert.That(usfm, Does.Contain("\"Translated new paragraph\"")); Assert.That(Regex.Matches(usfm, @"\\rem"), Has.Count.EqualTo(2)); @@ -624,8 +624,7 @@ public async Task GetUsfmAsync( PretranslationUsfmTextOrigin textOrigin, PretranslationUsfmTemplate template, PretranslationUsfmMarkerBehavior paragraphMarkerBehavior = PretranslationUsfmMarkerBehavior.Preserve, - PretranslationQuotationMarkBehavior quotationMarkBehavior = - PretranslationQuotationMarkBehavior.NormalizedSourceQuotes + PretranslationNormalizationBehavior quotationMarkBehavior = 
PretranslationNormalizationBehavior.Normalized ) { string usfm = await Service.GetUsfmAsync( @@ -638,7 +637,7 @@ public async Task GetUsfmAsync( paragraphMarkerBehavior: paragraphMarkerBehavior, embedBehavior: PretranslationUsfmMarkerBehavior.Preserve, styleMarkerBehavior: PretranslationUsfmMarkerBehavior.Strip, - quotationMarkBehavior: quotationMarkBehavior + quoteNormalizationBehavior: quotationMarkBehavior ); usfm = usfm.Replace("\r\n", "\n"); string parallel_usfm = await Service.GetUsfmAsync( @@ -651,7 +650,7 @@ public async Task GetUsfmAsync( paragraphMarkerBehavior: paragraphMarkerBehavior, embedBehavior: PretranslationUsfmMarkerBehavior.Preserve, styleMarkerBehavior: PretranslationUsfmMarkerBehavior.Strip, - quotationMarkBehavior: quotationMarkBehavior + quoteNormalizationBehavior: quotationMarkBehavior ); parallel_usfm = parallel_usfm.Replace("\r\n", "\n"); Assert.That(parallel_usfm, Is.EqualTo(usfm)); From 9207449589a1ad4a1003ec5de39e91e86aa35834 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 11 Aug 2025 11:57:03 -0400 Subject: [PATCH 3/5] Add QD to Swagger documentation --- src/Serval/src/Serval.Client/Client.g.cs | 10 ++++++---- .../Controllers/TranslationEnginesController.cs | 9 +++++++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/Serval/src/Serval.Client/Client.g.cs b/src/Serval/src/Serval.Client/Client.g.cs index 3483a9a2..2da93af2 100644 --- a/src/Serval/src/Serval.Client/Client.g.cs +++ b/src/Serval/src/Serval.Client/Client.g.cs @@ -2482,9 +2482,10 @@ public partial interface ITranslationEnginesClient /// The behavior of paragraph markers. /// The behavior of embed markers. /// The behavior of style markers. + /// The normalization behavior of quotes. /// The book in USFM format /// A server side error occurred. - System.Threading.Tasks.Task GetPretranslatedUsfmAsync(string id, string corpusId, string textId, PretranslationUsfmTextOrigin? textOrigin = null, PretranslationUsfmTemplate? 
template = null, PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior = null, PretranslationUsfmMarkerBehavior? embedBehavior = null, PretranslationUsfmMarkerBehavior? styleMarkerBehavior = null, PretranslationNormalizationBehavior? quotationMarkBehavior = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)); + System.Threading.Tasks.Task GetPretranslatedUsfmAsync(string id, string corpusId, string textId, PretranslationUsfmTextOrigin? textOrigin = null, PretranslationUsfmTemplate? template = null, PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior = null, PretranslationUsfmMarkerBehavior? embedBehavior = null, PretranslationUsfmMarkerBehavior? styleMarkerBehavior = null, PretranslationNormalizationBehavior? quoteNormalizationBehavior = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)); /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// @@ -5179,9 +5180,10 @@ public string BaseUrl /// The behavior of paragraph markers. /// The behavior of embed markers. /// The behavior of style markers. + /// The normalization behavior of quotes. /// The book in USFM format /// A server side error occurred. - public virtual async System.Threading.Tasks.Task GetPretranslatedUsfmAsync(string id, string corpusId, string textId, PretranslationUsfmTextOrigin? textOrigin = null, PretranslationUsfmTemplate? template = null, PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior = null, PretranslationUsfmMarkerBehavior? embedBehavior = null, PretranslationUsfmMarkerBehavior? styleMarkerBehavior = null, PretranslationNormalizationBehavior? 
quotationMarkBehavior = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) + public virtual async System.Threading.Tasks.Task GetPretranslatedUsfmAsync(string id, string corpusId, string textId, PretranslationUsfmTextOrigin? textOrigin = null, PretranslationUsfmTemplate? template = null, PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior = null, PretranslationUsfmMarkerBehavior? embedBehavior = null, PretranslationUsfmMarkerBehavior? styleMarkerBehavior = null, PretranslationNormalizationBehavior? quoteNormalizationBehavior = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) { if (id == null) throw new System.ArgumentNullException("id"); @@ -5232,9 +5234,9 @@ public string BaseUrl { urlBuilder_.Append(System.Uri.EscapeDataString("style-marker-behavior")).Append('=').Append(System.Uri.EscapeDataString(ConvertToString(styleMarkerBehavior, System.Globalization.CultureInfo.InvariantCulture))).Append('&'); } - if (quotationMarkBehavior != null) + if (quoteNormalizationBehavior != null) { - urlBuilder_.Append(System.Uri.EscapeDataString("quotation-mark-behavior")).Append('=').Append(System.Uri.EscapeDataString(ConvertToString(quotationMarkBehavior, System.Globalization.CultureInfo.InvariantCulture))).Append('&'); + urlBuilder_.Append(System.Uri.EscapeDataString("quotation-mark-behavior")).Append('=').Append(System.Uri.EscapeDataString(ConvertToString(quoteNormalizationBehavior, System.Globalization.CultureInfo.InvariantCulture))).Append('&'); } urlBuilder_.Length--; diff --git a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs index e6794b3d..5afca110 100644 --- a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs +++ b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs @@ -901,6 +901,10 @@ 
CancellationToken cancellationToken /// * `TryToPlace`: The USFM markers (or the entire embed) are placed in approximately the right location within the verse. **This option is only available for paragraph markers. Quality of placement may differ from language to language.**. /// * `Strip`: The USFM markers (or the entire embed) are removed. **This is the default for style markers**. /// + /// Quote normalization behavior is controlled by the `quoteNormalizationBehavior` parameter options: + /// * `Normalized`: The quotes in the pretranslated USFM are normalized quotes (typically straight quotes: ', ") in the style of the source data. + /// * `Denormalized`: The quotes in the pretranslated USFM are denormalized into the style of the target data. Quote denormalization may not be successful in all contexts. A remark will be added to the USFM listing the chapters that were successfully denormalized. + /// /// Only pretranslations for the most recent successful build of the engine are returned. /// The USFM parsing and marker types used are defined here: [this wiki](https://github.com/sillsdev/serval/wiki/USFM-Parsing-and-Translation). /// @@ -912,6 +916,7 @@ CancellationToken cancellationToken /// The behavior of paragraph markers. /// The behavior of embed markers. /// The behavior of style markers. + /// The normalization behavior of quotes. /// /// The book in USFM format /// The specified book does not exist in the source or target corpus. @@ -941,7 +946,7 @@ public async Task GetPretranslatedUsfmAsync( [FromQuery(Name = "paragraph-marker-behavior")] PretranslationUsfmMarkerBehavior? paragraphMarkerBehavior, [FromQuery(Name = "embed-behavior")] PretranslationUsfmMarkerBehavior? embedBehavior, [FromQuery(Name = "style-marker-behavior")] PretranslationUsfmMarkerBehavior? styleMarkerBehavior, - [FromQuery(Name = "quotation-mark-behavior")] PretranslationNormalizationBehavior? 
quotationMarkBehavior, + [FromQuery(Name = "quotation-mark-behavior")] PretranslationNormalizationBehavior? quoteNormalizationBehavior, CancellationToken cancellationToken ) { @@ -962,7 +967,7 @@ CancellationToken cancellationToken paragraphMarkerBehavior ?? PretranslationUsfmMarkerBehavior.Preserve, embedBehavior ?? PretranslationUsfmMarkerBehavior.Preserve, styleMarkerBehavior ?? PretranslationUsfmMarkerBehavior.Strip, - quotationMarkBehavior ?? PretranslationNormalizationBehavior.Normalized, + quoteNormalizationBehavior ?? PretranslationNormalizationBehavior.Normalized, cancellationToken ); if (usfm == "") From 640732cd69598d25360d7e90a70053f27c57c6db Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 13 Aug 2025 15:57:57 -0400 Subject: [PATCH 4/5] Address reviewer comment; update machine version --- .../Serval.Machine.Shared.csproj | 6 ++-- .../src/Serval.Shared/Serval.Shared.csproj | 2 +- .../Services/PretranslationService.cs | 32 +++++++------------ .../SIL.ServiceToolkit.csproj | 2 +- 4 files changed, 16 insertions(+), 26 deletions(-) diff --git a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj index 88a132c3..782e0bdb 100644 --- a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj +++ b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj @@ -36,9 +36,9 @@ - - - + + + diff --git a/src/Serval/src/Serval.Shared/Serval.Shared.csproj b/src/Serval/src/Serval.Shared/Serval.Shared.csproj index 74672ca8..9f6ab13d 100644 --- a/src/Serval/src/Serval.Shared/Serval.Shared.csproj +++ b/src/Serval/src/Serval.Shared/Serval.Shared.csproj @@ -19,7 +19,7 @@ - + diff --git a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs index 047435a9..517d30a4 100644 --- a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs +++ 
b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs @@ -183,7 +183,7 @@ PretranslationUsfmMarkerBehavior StyleBehavior embedBehavior: Map(embedBehavior), styleBehavior: Map(styleMarkerBehavior), updateBlockHandlers: updateBlockHandlers, - remarks: [disclaimerRemark, markerPlacementRemark] + remarks: remarks ) ?? ""; break; case PretranslationUsfmTextOrigin.PreferPretranslated: @@ -197,7 +197,7 @@ PretranslationUsfmMarkerBehavior StyleBehavior embedBehavior: Map(embedBehavior), styleBehavior: Map(styleMarkerBehavior), updateBlockHandlers: updateBlockHandlers, - remarks: [disclaimerRemark, markerPlacementRemark] + remarks: remarks ) ?? ""; break; case PretranslationUsfmTextOrigin.OnlyExisting: @@ -211,7 +211,7 @@ PretranslationUsfmMarkerBehavior StyleBehavior embedBehavior: Map(embedBehavior), styleBehavior: Map(styleMarkerBehavior), updateBlockHandlers: updateBlockHandlers, - remarks: [disclaimerRemark, markerPlacementRemark] + remarks: remarks ) ?? ""; break; case PretranslationUsfmTextOrigin.OnlyPretranslated: @@ -225,7 +225,7 @@ PretranslationUsfmMarkerBehavior StyleBehavior embedBehavior: Map(embedBehavior), styleBehavior: Map(styleMarkerBehavior), updateBlockHandlers: updateBlockHandlers, - remarks: [disclaimerRemark, markerPlacementRemark] + remarks: remarks ) ?? ""; break; } @@ -255,7 +255,7 @@ PretranslationUsfmMarkerBehavior StyleBehavior embedBehavior: Map(embedBehavior), styleBehavior: Map(styleMarkerBehavior), updateBlockHandlers: updateBlockHandlers, - remarks: [disclaimerRemark, markerPlacementRemark] + remarks: remarks ) ?? ""; break; case PretranslationUsfmTextOrigin.OnlyExisting: @@ -269,7 +269,7 @@ PretranslationUsfmMarkerBehavior StyleBehavior embedBehavior: Map(embedBehavior), styleBehavior: Map(styleMarkerBehavior), updateBlockHandlers: updateBlockHandlers, - remarks: [disclaimerRemark, markerPlacementRemark] + remarks: remarks ) ?? 
""; break; } @@ -289,23 +289,13 @@ PretranslationUsfmMarkerBehavior StyleBehavior ); } CorpusAnalysis analysis = build.Analysis.Single(c => c.CorpusRef == corpusId); - (string denormalizedUsfm, IReadOnlyList denormalizationRemarks) = DenormalizeQuotationMarks( - usfm, - analysis - ); - usfm = denormalizedUsfm; - remarks.AddRange(denormalizationRemarks); + usfm = DenormalizeQuotationMarks(usfm, analysis); } - var remarkUpdater = new UpdateUsfmParserHandler(remarks: remarks); - UsfmParser.Parse(usfm, remarkUpdater); - return remarkUpdater.GetUsfm(); + return usfm; } - private static (string Usfm, IReadOnlyList Remarks) DenormalizeQuotationMarks( - string usfm, - CorpusAnalysis analysis - ) + private static string DenormalizeQuotationMarks(string usfm, CorpusAnalysis analysis) { QuoteConvention sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( analysis.SourceQuoteConvention @@ -354,11 +344,11 @@ CorpusAnalysis analysis remarks.Add(quotationDenormalizationRemark); } - var updater = new UpdateUsfmParserHandler(updateBlockHandlers: [quotationMarkDenormalizer]); + var updater = new UpdateUsfmParserHandler(updateBlockHandlers: [quotationMarkDenormalizer], remarks: remarks); UsfmParser.Parse(usfm, updater); usfm = updater.GetUsfm(); - return (usfm, remarks); + return usfm; } /// diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj index bfef3d79..c1f24615 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj @@ -19,7 +19,7 @@ - + From 7845bbe2c9578b58270bebd68b38e017cbbaab12 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 13 Aug 2025 17:26:20 -0400 Subject: [PATCH 5/5] Fix mis-merged files --- src/Serval/src/Serval.Client/Client.g.cs | 145 ++++++++++-------- .../Contracts/CorpusAnalysisDto.cs | 8 - .../TranslationEnginesController.cs | 10 ++ 
.../src/Serval.Translation/Models/Build.cs | 2 +- .../Models/CorpusAnalysis.cs | 8 - .../Services/PretranslationService.cs | 6 +- .../Services/PretranslationServiceTests.cs | 8 +- 7 files changed, 103 insertions(+), 84 deletions(-) delete mode 100644 src/Serval/src/Serval.Translation/Contracts/CorpusAnalysisDto.cs delete mode 100644 src/Serval/src/Serval.Translation/Models/CorpusAnalysis.cs diff --git a/src/Serval/src/Serval.Client/Client.g.cs b/src/Serval/src/Serval.Client/Client.g.cs index 2da93af2..7bda757c 100644 --- a/src/Serval/src/Serval.Client/Client.g.cs +++ b/src/Serval/src/Serval.Client/Client.g.cs @@ -438,7 +438,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -450,7 +450,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -1162,7 +1162,7 @@ private string ConvertToString(object? 
value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -1174,7 +1174,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -1220,7 +1220,7 @@ public partial interface IDataFilesClient /// /// /// Sample request: - ///
+ ///
///
POST /files ///
{ ///
"format": "text", @@ -1435,7 +1435,7 @@ public string BaseUrl ///
/// /// Sample request: - ///
+ ///
///
POST /files ///
{ ///
"format": "text", @@ -2088,7 +2088,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -2100,7 +2100,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -2159,14 +2159,14 @@ public partial interface ITranslationEnginesClient ///
### nmt ///
The Neural Machine Translation engine is primarily used for pretranslations. It is fine-tuned from Meta's NLLB-200. Valid IETF language tags provided to Serval will be converted to [NLLB-200 codes](https://github.com/facebookresearch/flores/tree/main/flores200#languages-in-flores-200). See more about language tag resolution [here](https://github.com/sillsdev/serval/wiki/FLORES%E2%80%90200-Language-Code-Resolution-for-NMT-Engine). ///
* **IsModelPersisted**: (default to false) Whether the model can be downloaded by the client after it has been successfully built. - ///
+ ///
///
If you use a language among NLLB's supported languages, Serval will utilize everything the NLLB-200 model already knows about that language when translating. If the language you are working with is not among NLLB's supported languages, the language code will have no effect. - ///
+ ///
///
Typical endpoints: pretranslate ///
### echo ///
The echo engine has full coverage of all nmt and smt-transfer endpoints. Endpoints like create and build return empty responses. Endpoints like translate and get-word-graph echo the sent content back to the user in a format that mocks nmt or Smt. For example, translating a segment "test" with the echo engine would yield a translation response with translation "test". This engine is useful for debugging and testing purposes. ///
## Sample request: - ///
+ ///
///
{ ///
"name": "myTeam:myProject:myEngine", ///
"sourceLanguage": "el", @@ -2204,7 +2204,7 @@ public partial interface ITranslationEnginesClient ///
/// /// ## Sample request: - ///
+ ///
///
{ ///
"sourceLanguage": "en", ///
"targetLanguage": "en" @@ -2413,7 +2413,7 @@ public partial interface ITranslationEnginesClient ///
* The references defined in the SourceFile per line, if any. ///
* An auto-generated reference of `[TextId]:[lineNumber]`, 1 indexed. ///
* **Translation**: the text of the pretranslation - ///
+ ///
///
Pretranslations can be filtered by text id if provided. ///
Only pretranslations for the most recent successful build of the engine are returned. ///
@@ -2435,7 +2435,7 @@ public partial interface ITranslationEnginesClient ///
* The references defined in the SourceFile per line, if any. ///
* An auto-generated reference of `[TextId]:[lineNumber]`, 1 indexed. ///
* **Translation**: the text of the pretranslation - ///
+ ///
///
Only pretranslations for the most recent successful build of the engine are returned. /// /// The translation engine id @@ -2455,22 +2455,26 @@ public partial interface ITranslationEnginesClient ///
* `PreferPretranslated`: The existing and pretranslated texts are merged into the USFM, preferring pretranslated text. ///
* `OnlyExisting`: Return the existing target USFM file with no modifications (except updating the USFM id if needed). ///
* `OnlyPretranslated`: Only the pretranslated text is returned; all existing text in the target USFM is removed. - ///
+ ///
///
The source or target book can be used as the USFM template for the pretranslated text. The template can be controlled by the `template` parameter: ///
* `Auto`: The target book is used as the template if it exists; otherwise, the source book is used. **This is the default**. ///
* `Source`: The source book is used as the template. ///
* `Target`: The target book is used as the template. - ///
+ ///
///
The intra-verse USFM markers are handled in the following way: ///
* All verse and non-verse text is stripped of all intra-verse USFM to be pretranslated (if the book is chosen). ///
* Reference (\r) and remark (\rem) markers are not translated but carried through from the source to the target. ///
* Notes are stripped. - ///
+ ///
///
Preserving or stripping different types of USFM markers can be controlled by the `paragraphMarkerBehavior`, `embedBehavior`, and `styleMarkerBehavior` parameters. ///
* `PushToEnd`: The USFM markers (or the entire embed) are preserved and placed at the end of the verse. **This is the default for paragraph markers and embeds**. ///
* `TryToPlace`: The USFM markers (or the entire embed) are placed in approximately the right location within the verse. **This option is only available for paragraph markers. Quality of placement may differ from language to language.**. ///
* `Strip`: The USFM markers (or the entire embed) are removed. **This is the default for style markers**. - ///
+ ///
+ ///
Quote normalization behavior is controlled by the `quoteNormalizationBehavior` parameter options: + ///
* `Normalized`: The quotes in the pretranslated USFM are normalized quotes (typically straight quotes: ', ") in the style of the source data. + ///
* `Denormalized`: The quotes in the pretranslated USFM are denormalized into the style of the target data. Quote denormalization may not be successful in all contexts. A remark will be added to the USFM listing the chapters that were successfully denormalized. + ///
///
Only pretranslations for the most recent successful build of the engine are returned. ///
The USFM parsing and marker types used are defined here: [this wiki](https://github.com/sillsdev/serval/wiki/USFM-Parsing-and-Translation). /// @@ -2505,30 +2509,30 @@ public partial interface ITranslationEnginesClient ///
Specifying a corpus: ///
* A (legacy) corpus is selected by specifying CorpusId and a parallel corpus is selected by specifying ParallelCorpusId. ///
* A parallel corpus can be further filtered by specifying particular CorpusIds in SourceFilters or TargetFilters. - ///
+ ///
///
Filtering by textID or chapter: ///
* Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. ///
* Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) ///
* All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. - ///
+ ///
///
Filter - train on all or none ///
* If trainOn or pretranslate is not provided, all corpora will be used for training or pretranslation respectively ///
* If a corpus is selected for training or pretranslation and neither scriptureRange nor textIds are defined, all of the selected corpus will be used. ///
* If a corpus is selected for training or pretranslation and an empty scriptureRange or textIds is defined, none of the selected corpus will be used. ///
* If a corpus is selected for training or pretranslation but no further filters are provided, all selected corpora will be used for training or pretranslation respectively. - ///
+ ///
///
Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation, ///
the following text will be pretranslated: ///
* Text segments that are in the source and not the target (untranslated) ///
* Text segments that are in the source and the target, but where that target segment is not trained on. ///
If the engine does not support pretranslation, these fields have no effect. ///
Pretranslating has the same filtering as training. - ///
+ ///
///
The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. ///
See [nmt job settings documentation](https://github.com/sillsdev/serval/wiki/NMT-Build-Options) about configuring job parameters. ///
See [smt-transfer job settings documentation](https://github.com/sillsdev/serval/wiki/SMT-Transfer-Build-Options) about configuring job parameters. ///
See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. - ///
+ ///
///
When using a parallel corpus: ///
* If, within a single parallel corpus, multiple source corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only using the scriptureRange), those sources will be mixed where they overlap by randomly choosing from each source per line/verse. ///
* If, within a single parallel corpus, multiple target corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only using the scriptureRange), only the first of the targets that includes that textId/book will be used for that textId/book. @@ -2589,10 +2593,10 @@ public partial interface ITranslationEnginesClient /// /// If a Nmt build was successful and IsModelPersisted is `true` for the engine, ///
then the model from the most recent successful build can be downloaded. - ///
+ ///
///
The endpoint will return a URL that can be used to download the model for up to 1 hour ///
after the request is made. If the URL is not used within that time, a new request will need to be made. - ///
+ ///
///
The download itself is created by g-zipping together the folder containing the fine tuned model ///
with all necessary supporting files. This zipped folder is then named by the pattern: ///
* <engine_id>_<model_revision>.tar.gz @@ -2763,14 +2767,14 @@ public string BaseUrl ///
### nmt ///
The Neural Machine Translation engine is primarily used for pretranslations. It is fine-tuned from Meta's NLLB-200. Valid IETF language tags provided to Serval will be converted to [NLLB-200 codes](https://github.com/facebookresearch/flores/tree/main/flores200#languages-in-flores-200). See more about language tag resolution [here](https://github.com/sillsdev/serval/wiki/FLORES%E2%80%90200-Language-Code-Resolution-for-NMT-Engine). ///
* **IsModelPersisted**: (default to false) Whether the model can be downloaded by the client after it has been successfully built. - ///
+ ///
///
If you use a language among NLLB's supported languages, Serval will utilize everything the NLLB-200 model already knows about that language when translating. If the language you are working with is not among NLLB's supported languages, the language code will have no effect. - ///
+ ///
///
Typical endpoints: pretranslate ///
### echo ///
The echo engine has full coverage of all nmt and smt-transfer endpoints. Endpoints like create and build return empty responses. Endpoints like translate and get-word-graph echo the sent content back to the user in a format that mocks nmt or Smt. For example, translating a segment "test" with the echo engine would yield a translation response with translation "test". This engine is useful for debugging and testing purposes. ///
## Sample request: - ///
+ ///
///
{ ///
"name": "myTeam:myProject:myEngine", ///
"sourceLanguage": "el", @@ -3087,7 +3091,7 @@ public string BaseUrl ///
/// /// ## Sample request: - ///
+ ///
///
{ ///
"sourceLanguage": "en", ///
"targetLanguage": "en" @@ -4889,7 +4893,7 @@ public string BaseUrl ///
* The references defined in the SourceFile per line, if any. ///
* An auto-generated reference of `[TextId]:[lineNumber]`, 1 indexed. ///
* **Translation**: the text of the pretranslation - ///
+ ///
///
Pretranslations can be filtered by text id if provided. ///
Only pretranslations for the most recent successful build of the engine are returned. ///
@@ -5023,7 +5027,7 @@ public string BaseUrl ///
* The references defined in the SourceFile per line, if any. ///
* An auto-generated reference of `[TextId]:[lineNumber]`, 1 indexed. ///
* **Translation**: the text of the pretranslation - ///
+ ///
///
Only pretranslations for the most recent successful build of the engine are returned. /// /// The translation engine id @@ -5153,22 +5157,26 @@ public string BaseUrl ///
* `PreferPretranslated`: The existing and pretranslated texts are merged into the USFM, preferring pretranslated text. ///
* `OnlyExisting`: Return the existing target USFM file with no modifications (except updating the USFM id if needed). ///
* `OnlyPretranslated`: Only the pretranslated text is returned; all existing text in the target USFM is removed. - ///
+ ///
///
The source or target book can be used as the USFM template for the pretranslated text. The template can be controlled by the `template` parameter: ///
* `Auto`: The target book is used as the template if it exists; otherwise, the source book is used. **This is the default**. ///
* `Source`: The source book is used as the template. ///
* `Target`: The target book is used as the template. - ///
+ ///
///
The intra-verse USFM markers are handled in the following way: ///
* All verse and non-verse text is stripped of all intra-verse USFM to be pretranslated (if the book is chosen). ///
* Reference (\r) and remark (\rem) markers are not translated but carried through from the source to the target. ///
* Notes are stripped. - ///
+ ///
///
Preserving or stripping different types of USFM markers can be controlled by the `paragraphMarkerBehavior`, `embedBehavior`, and `styleMarkerBehavior` parameters. ///
* `PushToEnd`: The USFM markers (or the entire embed) are preserved and placed at the end of the verse. **This is the default for paragraph markers and embeds**. ///
* `TryToPlace`: The USFM markers (or the entire embed) are placed in approximately the right location within the verse. **This option is only available for paragraph markers. Quality of placement may differ from language to language.** ///
* `Strip`: The USFM markers (or the entire embed) are removed. **This is the default for style markers**. - ///
+ ///
+ ///
Quote normalization behavior is controlled by the `quoteNormalizationBehavior` parameter: + ///
* `Normalized`: The quotes in the pretranslated USFM are normalized quotes (typically straight quotes: ', ") in the style of the source data. + ///
* `Denormalized`: The quotes in the pretranslated USFM are denormalized into the style of the target data. Quote denormalization may not be successful in all contexts. A remark will be added to the USFM listing the chapters that were successfully denormalized. + ///
///
Only pretranslations for the most recent successful build of the engine are returned. ///
The USFM parsing and marker types used are defined here: [this wiki](https://github.com/sillsdev/serval/wiki/USFM-Parsing-and-Translation). /// @@ -5444,30 +5452,30 @@ public string BaseUrl ///
Specifying a corpus: ///
* A (legacy) corpus is selected by specifying CorpusId and a parallel corpus is selected by specifying ParallelCorpusId. ///
* A parallel corpus can be further filtered by specifying particular CorpusIds in SourceFilters or TargetFilters. - ///
+ ///
///
Filtering by textID or chapter: ///
* Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. ///
* Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) ///
* All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. - ///
+ ///
///
Filter - train on all or none ///
* If trainOn or pretranslate is not provided, all corpora will be used for training or pretranslation respectively ///
* If a corpus is selected for training or pretranslation and neither scriptureRange nor textIds are defined, all of the selected corpus will be used. ///
* If a corpus is selected for training or pretranslation and an empty scriptureRange or textIds is defined, none of the selected corpus will be used. ///
* If a corpus is selected for training or pretranslation but no further filters are provided, all selected corpora will be used for training or pretranslation respectively. - ///
+ ///
///
Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation, ///
the following text will be pretranslated: ///
* Text segments that are in the source and not the target (untranslated) ///
* Text segments that are in the source and the target, but where that target segment is not trained on. ///
If the engine does not support pretranslation, these fields have no effect. ///
Pretranslating has the same filtering as training. - ///
+ ///
///
The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. ///
See [nmt job settings documentation](https://github.com/sillsdev/serval/wiki/NMT-Build-Options) about configuring job parameters. ///
See [smt-transfer job settings documentation](https://github.com/sillsdev/serval/wiki/SMT-Transfer-Build-Options) about configuring job parameters. ///
See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. - ///
+ ///
///
When using a parallel corpus: ///
* If, within a single parallel corpus, multiple source corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only using the scriptureRange), those sources will be mixed where they overlap by randomly choosing from each source per line/verse. ///
* If, within a single parallel corpus, multiple target corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only using the scriptureRange), only the first of the targets that includes that textId/book will be used for that textId/book. @@ -5980,10 +5988,10 @@ public string BaseUrl /// /// If a Nmt build was successful and IsModelPersisted is `true` for the engine, ///
then the model from the most recent successful build can be downloaded. - ///
+ ///
///
The endpoint will return a URL that can be used to download the model for up to 1 hour ///
after the request is made. If the URL is not used within that time, a new request will need to be made. - ///
+ ///
///
The download itself is created by g-zipping together the folder containing the fine tuned model ///
with all necessary supporting files. This zipped folder is then named by the pattern: ///
* <engine_id>_<model_revision>.tar.gz @@ -6166,7 +6174,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -6178,7 +6186,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -6578,7 +6586,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -6590,7 +6598,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -7135,7 +7143,7 @@ private string ConvertToString(object? 
value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -7147,7 +7155,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -7205,7 +7213,7 @@ public partial interface IWordAlignmentEnginesClient ///
The echo-word-alignment engine has full coverage of all endpoints. Endpoints like create and build return empty responses. ///
Endpoints like align echo the sent content back to the user in the proper format. This engine is useful for debugging and testing purposes. ///
## Sample request: - ///
+ ///
///
{ ///
"name": "myTeam:myProject:myEngine", ///
"sourceLanguage": "el", @@ -7320,7 +7328,7 @@ public partial interface IWordAlignmentEnginesClient ///
* **SourceTokens**: the tokenized source segment ///
* **TargetTokens**: the tokenized target segment ///
* **Alignment**: a list of aligned word pairs with associated scores - ///
+ ///
///
Word alignments can be filtered by text id if provided. ///
Only word alignments for the most recent successful build of the engine are returned. ///
@@ -7350,10 +7358,10 @@ public partial interface IWordAlignmentEnginesClient ///
Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. ///
Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) ///
All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. - ///
+ ///
///
Specify the corpora or textIds to word align on. ///
When a corpus or textId is selected for word alignment, only text segments that are in both the source and the target will be aligned. - ///
+ ///
///
The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. ///
See [statistical alignment job settings documentation](https://github.com/sillsdev/serval/wiki/Statistical-Alignment-Build-Options) about configuring job parameters. ///
See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. @@ -7567,7 +7575,7 @@ public string BaseUrl ///
The echo-word-alignment engine has full coverage of all endpoints. Endpoints like create and build return empty responses. ///
Endpoints like align echo the sent content back to the user in the proper format. This engine is useful for debugging and testing purposes. ///
## Sample request: - ///
+ ///
///
{ ///
"name": "myTeam:myProject:myEngine", ///
"sourceLanguage": "el", @@ -8588,7 +8596,7 @@ public string BaseUrl ///
* **SourceTokens**: the tokenized source segment ///
* **TargetTokens**: the tokenized target segment ///
* **Alignment**: a list of aligned word pairs with associated scores - ///
+ ///
///
Word alignments can be filtered by text id if provided. ///
Only word alignments for the most recent successful build of the engine are returned. /// @@ -8825,10 +8833,10 @@ public string BaseUrl ///
Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. ///
Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) ///
All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. - ///
+ ///
///
Specify the corpora or textIds to word align on. ///
When a corpus or textId is selected for word alignment, only text segments that are in both the source and the target will be aligned. - ///
+ ///
///
The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. ///
See [statistical alignment job settings documentation](https://github.com/sillsdev/serval/wiki/Statistical-Alignment-Build-Options) about configuring job parameters. ///
See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. @@ -9406,7 +9414,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -9418,7 +9426,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -9679,7 +9687,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -9691,7 +9699,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? 
string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -10543,6 +10551,23 @@ public enum PhaseStage } + [System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.1.0.0 (NJsonSchema v11.0.2.0 (Newtonsoft.Json v13.0.0.0))")] + public partial class ParallelCorpusAnalysis + { + [Newtonsoft.Json.JsonProperty("parallelCorpusRef", Required = Newtonsoft.Json.Required.Always)] + [System.ComponentModel.DataAnnotations.Required(AllowEmptyStrings = true)] + public string ParallelCorpusRef { get; set; } = default!; + + [Newtonsoft.Json.JsonProperty("sourceQuoteConvention", Required = Newtonsoft.Json.Required.Always)] + [System.ComponentModel.DataAnnotations.Required(AllowEmptyStrings = true)] + public string SourceQuoteConvention { get; set; } = default!; + + [Newtonsoft.Json.JsonProperty("targetQuoteConvention", Required = Newtonsoft.Json.Required.Always)] + [System.ComponentModel.DataAnnotations.Required(AllowEmptyStrings = true)] + public string TargetQuoteConvention { get; set; } = default!; + + } + [System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.1.0.0 (NJsonSchema v11.0.2.0 (Newtonsoft.Json v13.0.0.0))")] public partial class TranslationBuildConfig { diff --git a/src/Serval/src/Serval.Translation/Contracts/CorpusAnalysisDto.cs b/src/Serval/src/Serval.Translation/Contracts/CorpusAnalysisDto.cs deleted file mode 100644 index 68501314..00000000 --- a/src/Serval/src/Serval.Translation/Contracts/CorpusAnalysisDto.cs +++ /dev/null @@ -1,8 +0,0 @@ -namespace Serval.Translation.Contracts; - -public record CorpusAnalysisDto -{ - public required string CorpusRef { get; init; } - public required string SourceQuoteConvention { get; init; } - public required string TargetQuoteConvention { get; init; } -} diff --git a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs 
b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs index 5afca110..ab1027f1 100644 --- a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs +++ b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs @@ -1898,6 +1898,16 @@ private static PhaseDto Map(BuildPhase source) StepCount = source.StepCount }; } + + private static ParallelCorpusAnalysisDto Map(ParallelCorpusAnalysis source) + { + return new ParallelCorpusAnalysisDto + { + ParallelCorpusRef = source.ParallelCorpusRef, + SourceQuoteConvention = source.SourceQuoteConvention, + TargetQuoteConvention = source.TargetQuoteConvention, + }; + } } #pragma warning restore CS0612 // Type or member is obsolete diff --git a/src/Serval/src/Serval.Translation/Models/Build.cs b/src/Serval/src/Serval.Translation/Models/Build.cs index 53274bb2..ecfefe02 100644 --- a/src/Serval/src/Serval.Translation/Models/Build.cs +++ b/src/Serval/src/Serval.Translation/Models/Build.cs @@ -20,5 +20,5 @@ public record Build : IInitializableEntity public bool? IsInitialized { get; set; } public DateTime? DateCreated { get; set; } public IReadOnlyList? Phases { get; init; } - public IReadOnlyList? Analysis { get; init; } + public IReadOnlyList? 
Analysis { get; init; } } diff --git a/src/Serval/src/Serval.Translation/Models/CorpusAnalysis.cs b/src/Serval/src/Serval.Translation/Models/CorpusAnalysis.cs deleted file mode 100644 index a894d6be..00000000 --- a/src/Serval/src/Serval.Translation/Models/CorpusAnalysis.cs +++ /dev/null @@ -1,8 +0,0 @@ -namespace Serval.Shared.Models; - -public record CorpusAnalysis -{ - public required string CorpusRef { get; init; } - public required string SourceQuoteConvention { get; init; } - public required string TargetQuoteConvention { get; init; } -} diff --git a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs index 517d30a4..32f51dbb 100644 --- a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs +++ b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs @@ -282,20 +282,20 @@ PretranslationUsfmMarkerBehavior StyleBehavior $"Unable to denormalize quotation marks: No quote convention analysis exists for build {build.Id}" ); } - if (!build.Analysis.Any(a => a.CorpusRef == corpusId)) + if (!build.Analysis.Any(a => a.ParallelCorpusRef == corpusId)) { throw new InvalidOperationException( $"Unable to denormalize quotation marks: No quote convention analysis exists for corpus {corpusId}" ); } - CorpusAnalysis analysis = build.Analysis.Single(c => c.CorpusRef == corpusId); + ParallelCorpusAnalysis analysis = build.Analysis.Single(c => c.ParallelCorpusRef == corpusId); usfm = DenormalizeQuotationMarks(usfm, analysis); } return usfm; } - private static string DenormalizeQuotationMarks(string usfm, CorpusAnalysis analysis) + private static string DenormalizeQuotationMarks(string usfm, ParallelCorpusAnalysis analysis) { QuoteConvention sourceQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName( analysis.SourceQuoteConvention diff --git a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs 
b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs index 5fe30186..a923fc7e 100644 --- a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs +++ b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs @@ -450,9 +450,9 @@ public TestEnvironment() DateFinished = DateTime.UnixEpoch, Analysis = [ - new CorpusAnalysis() + new ParallelCorpusAnalysis() { - CorpusRef = "corpus1", + ParallelCorpusRef = "corpus1", SourceQuoteConvention = "standard_english", TargetQuoteConvention = "standard_english" } @@ -465,9 +465,9 @@ public TestEnvironment() DateFinished = DateTime.UnixEpoch, Analysis = [ - new CorpusAnalysis() + new ParallelCorpusAnalysis() { - CorpusRef = "parallel_corpus1", + ParallelCorpusRef = "parallel_corpus1", SourceQuoteConvention = "standard_english", TargetQuoteConvention = "standard_english" }