diff --git a/Jenkinsfile b/Jenkinsfile index cf56c671b..7aa0ff575 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -23,7 +23,7 @@ pipeline { VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-27-23-0' - IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' + IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-26-23-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/it/taggers/cardinal.py b/nemo_text_processing/text_normalization/it/taggers/cardinal.py index 530451b99..59d3a61f9 100644 --- a/nemo_text_processing/text_normalization/it/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/it/taggers/cardinal.py @@ -84,8 +84,8 @@ def __init__(self, deterministic: bool = True): # double digit graph_tens = teen graph_tens |= tens + (pynutil.delete('0') | graph_digit) - graph_tens |= tens_one - graph_tens |= tens_eight + graph_tens |= pynutil.add_weight(tens_one, -0.01) + graph_tens |= pynutil.add_weight(tens_eight, -0.01) self.tens = graph_tens.optimize() diff --git a/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt index 127addd64..795ec896e 100644 --- a/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt @@ -8,4 +8,6 @@ 3544~tremila cinquecento quarantaquattro 1000~mille 1034500~un milione trentaquattromila cinquecento -3453243534~tremila quattrocento cinquantatre milioni duecento quarantatremila cinquecento trentaquattro \ No newline at end of file +3453243534~tremila quattrocento cinquantatre milioni duecento quarantatremila cinquecento trentaquattro +38~trentotto +7 giorni sono una settimana~sette giorni sono una settimana \ No newline at end of file diff --git a/tests/nemo_text_processing/it/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/it/data_text_normalization/test_cases_money.txt index db072f182..f800465df 100644 --- a/tests/nemo_text_processing/it/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/it/data_text_normalization/test_cases_money.txt @@ -1,4 +1,5 @@ 2,01 ₽~due rubli un copeca 3,23€~tre euro ventitre centesimi 4,2 £~quattro sterline venti penny -1 eur~un euro \ No newline at end of file +1 eur~un euro +1 eur per il caffè~un euro per il caffè \ No newline at end of file diff --git a/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt index ec9bc1a68..cc8e7667c 100644 --- a/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt @@ -1,4 +1,6 @@ -12:30~dodici e mezza -05:15~cinque e un quarto -17:15:26~diciassette e un quarto e ventisei secondi -23:45~ventitre e quarantacinque minuti \ No newline at end of file +12:30~dodici e trenta minuti~dodici e mezza +05:15~cinque e quindici minuti~cinque e un quarto +17:15:26~diciassette e quindici minuti e ventisei secondi~diciassette e un quarto e ventisei secondi +23:45~ventitre e quarantacinque minuti +03:38~tre e trentotto minuti +l'evento inizia alle 16:00~l'evento inizia alle sedici \ No newline at end of file diff --git a/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh index 44f77fe2d..c8285be97 100644 --- a/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh @@ -8,10 +8,14 @@ runtest () { # read test file while read testcase; do - IFS='~' read written spoken <<< $testcase - # replace non breaking space with breaking space - denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + IFS='~' read -a testcase_tokenized <<< $testcase + written=${testcase_tokenized[0]} + # only tests against first possible option when there are multiple shortest paths + spoken=${testcase_tokenized[1]} + # replace non breaking space with breaking space + denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + # trim white space spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" diff --git a/tests/nemo_text_processing/it/test_time.py b/tests/nemo_text_processing/it/test_time.py index c9abb76a7..840d2a9b0 100644 --- a/tests/nemo_text_processing/it/test_time.py +++ b/tests/nemo_text_processing/it/test_time.py @@ -27,4 +27,4 @@ class TestChar: @pytest.mark.unit def test_norm_char(self, test_input, expected): preds = self.normalizer.normalize(test_input, punct_post_process=True) - assert expected == preds + assert preds in expected