diff --git a/Jenkinsfile b/Jenkinsfile
index b6eeb7c53ade..81b34129417d 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -126,23 +126,23 @@ pipeline {
       parallel {
         stage('En TN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
           }
         }
         stage('En ITN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
           }
         }
         stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
-            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
+            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
           }
         }
         stage('Test En Hybrid TN') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/hybrid/wfst_lm_rescoring.py --data /home/TestData/nlp/text_norm/hybrid_tn/test.txt --regenerate_pkl --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 | grep "all_correct: True" || exit 1'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/hybrid/wfst_lm_rescoring.py --data /home/TestData/nlp/text_norm/hybrid_tn/test.txt --regenerate_pkl --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 | grep "all_correct: True" || exit 1'
           }
         }
       }
@@ -159,7 +159,7 @@ pipeline {
       parallel {
         stage('L2: Eng TN') {
           steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
+            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
             sh 'cd nemo_text_processing/text_normalization/ && python normalize.py --input_file=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
             sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
             sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_05-25.txt || exit 1'
@@ -169,7 +169,7 @@ pipeline {
         stage('L2: Eng ITN export') {
           steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
+            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
             sh 'cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output_file=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
             sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
             sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
@@ -178,7 +178,7 @@ pipeline {
         stage('L2: TN with Audio (audio and raw text)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 --text "The total amounts to \\$4.76." \
+            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --text "The total amounts to \\$4.76." \
             --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
             cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
           }
@@ -186,7 +186,7 @@ pipeline {
         stage('L2: TN with Audio (audio and text file)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
+            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
             --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
             cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
           }
@@ -194,7 +194,7 @@ pipeline {
         stage('L2: TN with Audio (manifest)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
+            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
           }
         }
       }
diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py
index e33d82b07160..ac72a2bf0e9a 100644
--- a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py
+++ b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py
@@ -57,6 +57,16 @@ def __init__(self):
             graph_hundred_component_at_least_one_none_zero_digit
         )
 
+        # Transducer for eleven hundred -> 1100 or twenty one hundred eleven -> 2111
+        graph_hundred_as_thousand = pynini.union(graph_teen, graph_ties + delete_space + graph_digit)
+        graph_hundred_as_thousand += delete_space + graph_hundred
+        graph_hundred_as_thousand += delete_space + pynini.union(
+            graph_teen | pynutil.insert("00"),
+            (graph_ties | pynutil.insert("0")) + delete_space + (graph_digit | pynutil.insert("0")),
+        )
+
+        graph_hundreds = graph_hundred_component | graph_hundred_as_thousand
+
         graph_thousands = pynini.union(
             graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("thousand"),
             pynutil.insert("000", weight=0.1),
@@ -102,7 +112,7 @@ def __init__(self):
             + delete_space
             + graph_thousands
             + delete_space
-            + graph_hundred_component,
+            + graph_hundreds,
             graph_zero,
         )
diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py
index 3ae4b7b5275a..43fe65f3e2e8 100644
--- a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py
+++ b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py
@@ -78,6 +78,7 @@ def _get_digits_graph():
 def _get_thousands_graph():
     graph_ties = _get_ties_graph()
     graph_hundred_component = (graph_digit + delete_space + pynutil.delete("hundred")) | pynutil.insert("0")
+    optional_end = pynini.closure(pynutil.delete("and "), 0, 1)
     graph = (
         graph_digit
         + delete_space
@@ -85,7 +86,7 @@ def _get_thousands_graph():
         + delete_space
         + graph_hundred_component
         + delete_space
-        + (graph_teen | graph_ties)
+        + (graph_teen | graph_ties | (optional_end + pynutil.insert("0") + graph_digit))
     )
     return graph
diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_cardinal.txt
index fb668c03aceb..d865e7eafa67 100644
--- a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_cardinal.txt
+++ b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_cardinal.txt
@@ -22,3 +22,7 @@ eighteen million four hundred fifty thousand nine hundred ninety~18450990
 eighteen million nine hundred forty thousand seven hundred twenty two~18940722
 eighteen million six hundred ninety thousand nine hundred sixteen~18690916
 eighteen thousand eight hundred eighty~18880
+eleven hundred~1100
+twenty one hundred~2100
+twenty one hundred and eleven~2111
+eleven hundred twenty one~1121
diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_money.txt
index e6b1802e66e8..3fbb895f96ec 100644
--- a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_money.txt
+++ b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_money.txt
@@ -47,3 +47,6 @@ eighteen thousand one hundred twenty four dollars~$18124
 eighteen thousand one hundred twenty nine dollars~$18129
 one thousand fifty five dollars~$1055
 one fifty five dollars~$155
+fifteen hundred dollars~$1500
+ninety nine hundred dollars~$9900
+ninety nine hundred and fifteen dollars and one cent~$9915.01
diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_ordinal.txt
index 1ceaaf0acfde..f701a68bb798 100644
--- a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_ordinal.txt
+++ b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_ordinal.txt
@@ -14,6 +14,7 @@ twenty third~23rd
 one hundred eleventh~111th
 one thousandth~1000th
 one hundred twenty first~121st
+eleven hundred twenty first~1121st
 second~2nd
 tenth~10th
 sixth~6th
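
Not part of the patch: a minimal standalone sketch of the "N hundred" cardinal idea added in cardinal.py above, handy for sanity-checking the composition locally. The tiny string maps, the delete_space definition and the apply helper below are illustrative stand-ins and not NeMo's real graph_digit / graph_teen / graph_ties from graph_utils; only standard pynini/pynutil calls are used.

    import pynini
    from pynini.lib import pynutil

    # Illustrative stand-ins for NeMo's graph_digit / graph_teen / graph_ties maps.
    graph_digit = pynini.string_map([("one", "1"), ("two", "2"), ("five", "5")])
    graph_teen = pynini.string_map([("eleven", "11"), ("fifteen", "15")])
    graph_ties = pynini.string_map([("twenty", "2"), ("ninety", "9")])

    delete_space = pynutil.delete(pynini.closure(" "))  # delete zero or more spaces
    delete_hundred = pynutil.delete("hundred")

    # Leading "eleven" / "twenty one" becomes the first two digits.
    prefix = pynini.union(graph_teen, graph_ties + delete_space + graph_digit)
    # Trailing words become the last two digits, zero-padded when absent.
    suffix = pynini.union(
        graph_teen | pynutil.insert("00"),
        (graph_ties | pynutil.insert("0")) + delete_space + (graph_digit | pynutil.insert("0")),
    )
    graph_hundred_as_thousand = (prefix + delete_space + delete_hundred + delete_space + suffix).optimize()

    def apply(text: str) -> str:
        # Compose the input string with the grammar and read off the lowest-weight output.
        lattice = pynini.compose(pynini.accep(text), graph_hundred_as_thousand)
        return pynini.shortestpath(lattice).project("output").string()

    for phrase in ["eleven hundred", "twenty one hundred eleven", "eleven hundred twenty one"]:
        print(phrase, "->", apply(phrase))  # expected: 1100, 2111, 1121

Once the dated grammar caches referenced in the Jenkinsfile are rebuilt, the same behaviour should be observable through the existing CLI, e.g. python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text "eleven hundred twenty one" should yield 1121, matching the new test_cases_cardinal.txt entries.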