NVIDIA-NeMo · yzhang123 · Nov 16, 2022 · Nov 15, 2022 · Nov 16, 2022 · Nov 16, 2022
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -126,23 +126,23 @@ pipeline {
       parallel {
         stage('En TN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
           }
         }
         stage('En ITN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
           }
         }
         stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
-            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
+            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
           }
         }
         stage('Test En Hybrid TN') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/hybrid/wfst_lm_rescoring.py --data /home/TestData/nlp/text_norm/hybrid_tn/test.txt --regenerate_pkl --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 | grep "all_correct: True" || exit 1'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/hybrid/wfst_lm_rescoring.py --data /home/TestData/nlp/text_norm/hybrid_tn/test.txt --regenerate_pkl --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 | grep "all_correct: True" || exit 1'
           }
         }
       }
@@ -159,7 +159,7 @@ pipeline {
       parallel {
         stage('L2: Eng TN') {
           steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
+            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
             sh 'cd nemo_text_processing/text_normalization/ &&  python normalize.py --input_file=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
             sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
             sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_05-25.txt || exit 1'
@@ -169,7 +169,7 @@ pipeline {
 
         stage('L2: Eng ITN export') {
           steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
+            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
             sh 'cd nemo_text_processing/inverse_text_normalization/ &&  python inverse_normalize.py --input_file=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output_file=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
             sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
             sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
@@ -178,23 +178,23 @@ pipeline {
         stage('L2: TN with Audio (audio and raw text)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 --text "The total amounts to \\$4.76." \
+            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --text "The total amounts to \\$4.76." \
             --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
             cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
           }
         }
         stage('L2: TN with Audio (audio and text file)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
+            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
             --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
             cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
           }
         }
         stage('L2: TN with Audio (manifest)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
+            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
           }
         }
       }

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py
@@ -57,6 +57,16 @@ def __init__(self):
             graph_hundred_component_at_least_one_none_zero_digit
         )
 
+        # Transducer for eleven hundred -> 1100 or twenty one hundred eleven -> 2111
+        graph_hundred_as_thousand = pynini.union(graph_teen, graph_ties + delete_space + graph_digit)
+        graph_hundred_as_thousand += delete_space + graph_hundred
+        graph_hundred_as_thousand += delete_space + pynini.union(
+            graph_teen | pynutil.insert("00"),
+            (graph_ties | pynutil.insert("0")) + delete_space + (graph_digit | pynutil.insert("0")),
+        )
+
+        graph_hundreds = graph_hundred_component | graph_hundred_as_thousand
+
         graph_thousands = pynini.union(
             graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("thousand"),
             pynutil.insert("000", weight=0.1),
@@ -102,7 +112,7 @@ def __init__(self):
             + delete_space
             + graph_thousands
             + delete_space
-            + graph_hundred_component,
+            + graph_hundreds,
             graph_zero,
         )
 

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py
@@ -78,14 +78,15 @@ def _get_digits_graph():
     def _get_thousands_graph():
         graph_ties = _get_ties_graph()
         graph_hundred_component = (graph_digit + delete_space + pynutil.delete("hundred")) | pynutil.insert("0")
+        optional_end = pynini.closure(pynutil.delete("and "), 0, 1)
         graph = (
             graph_digit
             + delete_space
             + pynutil.delete("thousand")
             + delete_space
             + graph_hundred_component
             + delete_space
-            + (graph_teen | graph_ties)
+            + (graph_teen | graph_ties | (optional_end + pynutil.insert("0") + graph_digit))
         )
         return graph
 

diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_cardinal.txt
@@ -22,3 +22,7 @@ eighteen million four hundred fifty thousand nine hundred ninety~18450990
 eighteen million nine hundred forty thousand seven hundred twenty two~18940722
 eighteen million six hundred ninety thousand nine hundred sixteen~18690916
 eighteen thousand eight hundred eighty~18880
+eleven hundred~1100
+twenty one hundred~2100
+twenty one hundred and eleven~2111
+eleven hundred twenty one~1121
diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_money.txt
@@ -47,3 +47,6 @@ eighteen thousand one hundred twenty four dollars~$18124
 eighteen thousand one hundred twenty nine dollars~$18129
 one thousand fifty five dollars~$1055
 one fifty five dollars~$155
+fifteen hundred dollars~$1500
+ninety nine hundred dollars~$9900
+ninety nine hundred and fifteen dollars and one cent~$9915.01
diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_ordinal.txt
@@ -14,6 +14,7 @@ twenty third~23rd
 one hundred eleventh~111th
 one thousandth~1000th
 one hundred twenty first~121st
+eleven hundred twenty first~1121st
 second~2nd
 tenth~10th
 sixth~6th