From 9caf097d8155dda5f74f5ea86d57dd168fba52c6 Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Wed, 25 Oct 2023 11:45:56 -0700 Subject: [PATCH 1/6] add missing test cases Signed-off-by: Mariana Graterol Fuenmayor --- .../it/data_text_normalization/test_cases_cardinal.txt | 3 ++- .../it/data_text_normalization/test_cases_time.txt | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt index 127addd64..781ba7a8c 100644 --- a/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt @@ -8,4 +8,5 @@ 3544~tremila cinquecento quarantaquattro 1000~mille 1034500~un milione trentaquattromila cinquecento -3453243534~tremila quattrocento cinquantatre milioni duecento quarantatremila cinquecento trentaquattro \ No newline at end of file +3453243534~tremila quattrocento cinquantatre milioni duecento quarantatremila cinquecento trentaquattro +38~trentotto \ No newline at end of file diff --git a/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt index ec9bc1a68..077fa6448 100644 --- a/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt @@ -1,4 +1,5 @@ -12:30~dodici e mezza -05:15~cinque e un quarto -17:15:26~diciassette e un quarto e ventisei secondi -23:45~ventitre e quarantacinque minuti \ No newline at end of file +12:30~dodici e trenta minuti~dodici e mezza +05:15~cinque e quindici minuti~cinque e un quarto +17:15:26~diciassette e quindici minuti e ventisei secondi~diciassette e un quarto e ventisei secondi +23:45~ventitre e quarantacinque minuti +03:38~tre e trentotto minuti \ No newline at end of file From ae9ca507ec9a01b48469ef1f707ecb25fa24bb74 Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Wed, 25 Oct 2023 11:46:22 -0700 Subject: [PATCH 2/6] fix bug with time tests Signed-off-by: Mariana Graterol Fuenmayor --- .../it/test_sparrowhawk_normalization.sh | 10 +++++++--- tests/nemo_text_processing/it/test_time.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh index 44f77fe2d..c8285be97 100644 --- a/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh @@ -8,10 +8,14 @@ runtest () { # read test file while read testcase; do - IFS='~' read written spoken <<< $testcase - # replace non breaking space with breaking space - denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + IFS='~' read -a testcase_tokenized <<< $testcase + written=${testcase_tokenized[0]} + # only tests against first possible option when there are multiple shortest paths + spoken=${testcase_tokenized[1]} + # replace non breaking space with breaking space + denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + # trim white space spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" diff --git a/tests/nemo_text_processing/it/test_time.py b/tests/nemo_text_processing/it/test_time.py index c9abb76a7..840d2a9b0 100644 --- a/tests/nemo_text_processing/it/test_time.py +++ b/tests/nemo_text_processing/it/test_time.py @@ -27,4 +27,4 @@ class TestChar: @pytest.mark.unit def test_norm_char(self, test_input, expected): preds = self.normalizer.normalize(test_input, punct_post_process=True) - assert expected == preds + assert preds in expected From 483d3617abbdbf8e3d1fd8e86da8b02afb4769ce Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Wed, 25 Oct 2023 11:46:50 -0700 Subject: [PATCH 3/6] update ci date Signed-off-by: Mariana Graterol Fuenmayor --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index cf56c671b..9f3ae4ca6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -23,7 +23,7 @@ pipeline { VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-27-23-0' - IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' + IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-25-23-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { From 681c967b06c665500d3419bbb00f87fa43776a22 Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Thu, 26 Oct 2023 10:20:28 -0700 Subject: [PATCH 4/6] add sentence test cases Signed-off-by: Mariana Graterol Fuenmayor --- .../it/data_text_normalization/test_cases_cardinal.txt | 3 ++- .../it/data_text_normalization/test_cases_money.txt | 3 ++- .../it/data_text_normalization/test_cases_time.txt | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt index 781ba7a8c..795ec896e 100644 --- a/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt @@ -9,4 +9,5 @@ 1000~mille 1034500~un milione trentaquattromila cinquecento 3453243534~tremila quattrocento cinquantatre milioni duecento quarantatremila cinquecento trentaquattro -38~trentotto \ No newline at end of file +38~trentotto +7 giorni sono una settimana~sette giorni sono una settimana \ No newline at end of file diff --git a/tests/nemo_text_processing/it/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/it/data_text_normalization/test_cases_money.txt index db072f182..f800465df 100644 --- a/tests/nemo_text_processing/it/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/it/data_text_normalization/test_cases_money.txt @@ -1,4 +1,5 @@ 2,01 ₽~due rubli un copeca 3,23€~tre euro ventitre centesimi 4,2 £~quattro sterline venti penny -1 eur~un euro \ No newline at end of file +1 eur~un euro +1 eur per il caffè~un euro per il caffè \ No newline at end of file diff --git a/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt index 077fa6448..cc8e7667c 100644 --- a/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt @@ -2,4 +2,5 @@ 05:15~cinque e quindici minuti~cinque e un quarto 17:15:26~diciassette e quindici minuti e ventisei secondi~diciassette e un quarto e ventisei secondi 23:45~ventitre e quarantacinque minuti -03:38~tre e trentotto minuti \ No newline at end of file +03:38~tre e trentotto minuti +l'evento inizia alle 16:00~l'evento inizia alle sedici \ No newline at end of file From 1dfad9af8fe922308e8d28fff2be273c0ba048db Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Thu, 26 Oct 2023 10:21:00 -0700 Subject: [PATCH 5/6] refine shortest path for irregular cardinals Signed-off-by: Mariana Graterol Fuenmayor --- .../text_normalization/it/taggers/cardinal.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/text_normalization/it/taggers/cardinal.py b/nemo_text_processing/text_normalization/it/taggers/cardinal.py index 530451b99..59d3a61f9 100644 --- a/nemo_text_processing/text_normalization/it/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/it/taggers/cardinal.py @@ -84,8 +84,8 @@ def __init__(self, deterministic: bool = True): # double digit graph_tens = teen graph_tens |= tens + (pynutil.delete('0') | graph_digit) - graph_tens |= tens_one - graph_tens |= tens_eight + graph_tens |= pynutil.add_weight(tens_one, -0.01) + graph_tens |= pynutil.add_weight(tens_eight, -0.01) self.tens = graph_tens.optimize() From 62246ae69a27b58ecba46995c817368558b19351 Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Thu, 26 Oct 2023 10:23:02 -0700 Subject: [PATCH 6/6] update ci date Signed-off-by: Mariana Graterol Fuenmayor --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9f3ae4ca6..7aa0ff575 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -23,7 +23,7 @@ pipeline { VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-27-23-0' - IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-25-23-0' + IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-26-23-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages {