From f3555a8ba1cec37b9a534f70a08ba3235ce856e4 Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Thu, 26 Oct 2023 15:45:30 -0400 Subject: [PATCH 01/90] IT TN improvement on tests (#120) * add missing test cases Signed-off-by: Mariana Graterol Fuenmayor * fix bug with time tests Signed-off-by: Mariana Graterol Fuenmayor * update ci date Signed-off-by: Mariana Graterol Fuenmayor * add sentence test cases Signed-off-by: Mariana Graterol Fuenmayor * refine shortest path for irregular cardinals Signed-off-by: Mariana Graterol Fuenmayor * update ci date Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- .../text_normalization/it/taggers/cardinal.py | 4 ++-- .../it/data_text_normalization/test_cases_cardinal.txt | 4 +++- .../it/data_text_normalization/test_cases_money.txt | 3 ++- .../it/data_text_normalization/test_cases_time.txt | 10 ++++++---- .../it/test_sparrowhawk_normalization.sh | 10 +++++++--- tests/nemo_text_processing/it/test_time.py | 2 +- 7 files changed, 22 insertions(+), 13 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index cf56c671b..7aa0ff575 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -23,7 +23,7 @@ pipeline { VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-27-23-0' - IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' + IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-26-23-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/it/taggers/cardinal.py b/nemo_text_processing/text_normalization/it/taggers/cardinal.py index 530451b99..59d3a61f9 100644 --- a/nemo_text_processing/text_normalization/it/taggers/cardinal.py +++ 
b/nemo_text_processing/text_normalization/it/taggers/cardinal.py @@ -84,8 +84,8 @@ def __init__(self, deterministic: bool = True): # double digit graph_tens = teen graph_tens |= tens + (pynutil.delete('0') | graph_digit) - graph_tens |= tens_one - graph_tens |= tens_eight + graph_tens |= pynutil.add_weight(tens_one, -0.01) + graph_tens |= pynutil.add_weight(tens_eight, -0.01) self.tens = graph_tens.optimize() diff --git a/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt index 127addd64..795ec896e 100644 --- a/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/it/data_text_normalization/test_cases_cardinal.txt @@ -8,4 +8,6 @@ 3544~tremila cinquecento quarantaquattro 1000~mille 1034500~un milione trentaquattromila cinquecento -3453243534~tremila quattrocento cinquantatre milioni duecento quarantatremila cinquecento trentaquattro \ No newline at end of file +3453243534~tremila quattrocento cinquantatre milioni duecento quarantatremila cinquecento trentaquattro +38~trentotto +7 giorni sono una settimana~sette giorni sono una settimana \ No newline at end of file diff --git a/tests/nemo_text_processing/it/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/it/data_text_normalization/test_cases_money.txt index db072f182..f800465df 100644 --- a/tests/nemo_text_processing/it/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/it/data_text_normalization/test_cases_money.txt @@ -1,4 +1,5 @@ 2,01 ₽~due rubli un copeca 3,23€~tre euro ventitre centesimi 4,2 £~quattro sterline venti penny -1 eur~un euro \ No newline at end of file +1 eur~un euro +1 eur per il caffè~un euro per il caffè \ No newline at end of file diff --git a/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt 
b/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt index ec9bc1a68..cc8e7667c 100644 --- a/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt @@ -1,4 +1,6 @@ -12:30~dodici e mezza -05:15~cinque e un quarto -17:15:26~diciassette e un quarto e ventisei secondi -23:45~ventitre e quarantacinque minuti \ No newline at end of file +12:30~dodici e trenta minuti~dodici e mezza +05:15~cinque e quindici minuti~cinque e un quarto +17:15:26~diciassette e quindici minuti e ventisei secondi~diciassette e un quarto e ventisei secondi +23:45~ventitre e quarantacinque minuti +03:38~tre e trentotto minuti +l'evento inizia alle 16:00~l'evento inizia alle sedici \ No newline at end of file diff --git a/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh index 44f77fe2d..c8285be97 100644 --- a/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh @@ -8,10 +8,14 @@ runtest () { # read test file while read testcase; do - IFS='~' read written spoken <<< $testcase - # replace non breaking space with breaking space - denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + IFS='~' read -a testcase_tokenized <<< $testcase + written=${testcase_tokenized[0]} + # only tests against first possible option when there are multiple shortest paths + spoken=${testcase_tokenized[1]} + # replace non breaking space with breaking space + denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + # trim white space spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" diff 
--git a/tests/nemo_text_processing/it/test_time.py b/tests/nemo_text_processing/it/test_time.py index c9abb76a7..840d2a9b0 100644 --- a/tests/nemo_text_processing/it/test_time.py +++ b/tests/nemo_text_processing/it/test_time.py @@ -27,4 +27,4 @@ class TestChar: @pytest.mark.unit def test_norm_char(self, test_input, expected): preds = self.normalizer.normalize(test_input, punct_post_process=True) - assert expected == preds + assert preds in expected From e67e17c5a52245b73631f75fc51c55afaa7e25d4 Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Fri, 27 Oct 2023 15:41:37 -0400 Subject: [PATCH 02/90] add single letter exception for roman numerals (#121) * add single letter exception for roman numerals Signed-off-by: Mariana Graterol Fuenmayor * update ci dir Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- .../es/data/ordinals/roman_exceptions.tsv | 16 +++++++++++++++- .../test_cases_ordinal.txt | 3 ++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 7aa0ff575..430544f8b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -14,7 +14,7 @@ pipeline { AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-23-23-0' DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-14-23-0' - ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-29-23-0' + ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0' ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-1' FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-16-23-1' HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' diff --git a/nemo_text_processing/text_normalization/es/data/ordinals/roman_exceptions.tsv b/nemo_text_processing/text_normalization/es/data/ordinals/roman_exceptions.tsv index 
536a862a2..f82d7e6c3 100644 --- a/nemo_text_processing/text_normalization/es/data/ordinals/roman_exceptions.tsv +++ b/nemo_text_processing/text_normalization/es/data/ordinals/roman_exceptions.tsv @@ -6,4 +6,18 @@ Mi MI vi Vi -VI \ No newline at end of file +VI +I +i +V +v +X +x +L +l +C +c +D +d +M +m \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/es/data_text_normalization/test_cases_ordinal.txt index b39ccf4b8..33737f4ed 100644 --- a/tests/nemo_text_processing/es/data_text_normalization/test_cases_ordinal.txt +++ b/tests/nemo_text_processing/es/data_text_normalization/test_cases_ordinal.txt @@ -113,4 +113,5 @@ 1ro~primero 1ra~primera maría vii~maría séptima~maría séptimo -todo mi reconocimiento~todo mi reconocimiento \ No newline at end of file +todo mi reconocimiento~todo mi reconocimiento +V~V \ No newline at end of file From c2b9e0addae359ef4390ae1fbf415ab13a49ab6e Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Fri, 3 Nov 2023 01:23:33 -0400 Subject: [PATCH 03/90] fix broken path for nondet whitelist (#124) Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/it/taggers/whitelist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/it/taggers/whitelist.py b/nemo_text_processing/text_normalization/it/taggers/whitelist.py index d8ad93e94..daade2828 100644 --- a/nemo_text_processing/text_normalization/it/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/it/taggers/whitelist.py @@ -56,7 +56,7 @@ def _get_whitelist_graph(input_case, file): graph = whitelist_provided if not deterministic: - units_graph = _get_whitelist_graph(input_case, file=get_abs_path("data/measures/measurements.tsv")) + units_graph = _get_whitelist_graph(input_case, file=get_abs_path("data/measure/measurements.tsv")) graph |= 
units_graph self.graph = graph From 41b21e27c7af4c5c95143b53b083c83a5d4b020a Mon Sep 17 00:00:00 2001 From: anand-nv <105917641+anand-nv@users.noreply.github.com> Date: Wed, 22 Nov 2023 04:17:25 +0530 Subject: [PATCH 04/90] Increase weights for serial (en TN) (#128) * Increase weights for serial (en TN) Resolves https://github.com/NVIDIA/NeMo-text-processing/issues/126 Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> * Add tests for fix Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> * Update Jenkinsfile cache path Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> * Update Jenkinsfile. Fix cache folder Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> --------- Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- .../text_normalization/en/taggers/tokenize_and_classify.py | 2 +- .../en/data_text_normalization/test_cases_money.txt | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 430544f8b..83284c592 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -13,7 +13,7 @@ pipeline { AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-23-23-0' DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-14-23-0' + EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-18-23-0' ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0' ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-1' FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-16-23-1' diff --git a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py index b83878f80..b3ac3ed75 100644 --- a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py +++ 
b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py @@ -173,7 +173,7 @@ def __init__( | pynutil.add_weight(electonic_graph, 1.1) | pynutil.add_weight(fraction_graph, 1.1) | pynutil.add_weight(range_graph, 1.1) - | pynutil.add_weight(serial_graph, 1.1001) # should be higher than the rest of the classes + | pynutil.add_weight(serial_graph, 1.1003) # should be higher than the rest of the classes ) # roman_graph = RomanFst(deterministic=deterministic).fst diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt index d1218e8bd..6fcef0ea5 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt @@ -63,3 +63,4 @@ $1,925.21~one thousand nine hundred and twenty five dollars twenty one cents $1,234.123~one thousand two hundred and thirty four point one two three dollars US $76.3 trillion~US seventy six point three trillion dollars US$76.3 trillion~seventy six point three trillion us dollars +The price for each canned salmon is $5, each bottle of peanut butter is $3~The price for each canned salmon is five dollars, each bottle of peanut butter is three dollars From 230b21e28887c47b63baa10ef1836c6fb94cda58 Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Thu, 7 Dec 2023 22:17:32 -0500 Subject: [PATCH 05/90] add measures file for FR TN (#131) * add measures file Signed-off-by: Mariana Graterol Fuenmayor * update whitelist data Signed-off-by: Mariana Graterol Fuenmayor * add fr tn tests Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- .../fr/data/measures/__init__.py | 13 +++++ .../fr/data/measures/measurements.tsv | 17 +++++++ .../text_normalization/fr/data/whitelist.tsv | 13 ++++- 
.../test_cases_whitelist.txt | 6 +++ .../test_cases_word.txt | 49 +++++++++++++++++++ .../fr/test_sparrowhawk_normalization.sh | 10 ++++ .../nemo_text_processing/fr/test_whitelist.py | 10 ++++ tests/nemo_text_processing/fr/test_word.py | 10 ++++ 9 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 nemo_text_processing/text_normalization/fr/data/measures/__init__.py create mode 100644 nemo_text_processing/text_normalization/fr/data/measures/measurements.tsv create mode 100644 tests/nemo_text_processing/fr/data_text_normalization/test_cases_whitelist.txt create mode 100644 tests/nemo_text_processing/fr/data_text_normalization/test_cases_word.txt diff --git a/Jenkinsfile b/Jenkinsfile index 83284c592..346426884 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -16,7 +16,7 @@ pipeline { EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-18-23-0' ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0' ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-1' - FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-16-23-1' + FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/12-05-23-0' HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' diff --git a/nemo_text_processing/text_normalization/fr/data/measures/__init__.py b/nemo_text_processing/text_normalization/fr/data/measures/__init__.py new file mode 100644 index 000000000..6ebc808fa --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/data/measures/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/fr/data/measures/measurements.tsv b/nemo_text_processing/text_normalization/fr/data/measures/measurements.tsv new file mode 100644 index 000000000..55f0d4643 --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/data/measures/measurements.tsv @@ -0,0 +1,17 @@ +m mètres +m² mètres carrés +m³ mètres cubes +s secondes +min minutes +h heures +° degrés +°C degrés celsius +g grammes +l litres +kg kilos +'' pouce +lb livres +% pour cent +‰ pour mille +km/h kilomètres heure +m/h mètres à l’heure \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/fr/data/whitelist.tsv b/nemo_text_processing/text_normalization/fr/data/whitelist.tsv index 80bc9f6d4..f33457ba8 100644 --- a/nemo_text_processing/text_normalization/fr/data/whitelist.tsv +++ b/nemo_text_processing/text_normalization/fr/data/whitelist.tsv @@ -1 +1,12 @@ -bonjour \ No newline at end of file +Mᵐᵉ madame +Mᵐᵉˢ mesdames +Mˡˡᵉ mademoiselle +Mˡˡᵉˢ mademoiselles +Dʳ docteur +Dʳˢ docteurs +Dʳᵉ docteure +Dʳᵉˢ docteures +apr. J.-C. après jésus-christ +av. J.-C. avant Jésus-Christ +le hon. l’honorable +le très hon. 
le très hononrable \ No newline at end of file diff --git a/tests/nemo_text_processing/fr/data_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..50997ed9b --- /dev/null +++ b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,6 @@ +Dʳ~docteur +Dʳᵉˢ~docteures +Mᵐᵉ~madame +Mᵐᵉˢ~mesdames +Mˡˡᵉ~mademoiselle +Mˡˡᵉˢ~mademoiselles \ No newline at end of file diff --git a/tests/nemo_text_processing/fr/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..81d8cfd0a --- /dev/null +++ b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_word.txt @@ -0,0 +1,49 @@ +~ +yahoo!~yahoo! +20 !~vingt ! +x~x +—~— +aaa~aaa +aabach~aabach +aabenraa~aabenraa +aabye~aabye +aaccessed~aaccessed +aach~aach +aachen's~aachen's +aadri~aadri +aafia~aafia +aagaard~aagaard +aagadu~aagadu +aagard~aagard +aagathadi~aagathadi +aaghart's~aaghart's +aagnes~aagnes +aagomoni~aagomoni +aagon~aagon +aagoo~aagoo +aagot~aagot +aahar~aahar +aahh~aahh +aahperd~aahperd +aaibinterstate~aaibinterstate +aajab~aajab +aakasa~aakasa +aakervik~aakervik +aakirkeby~aakirkeby +aalam~aalam +aalbaek~aalbaek +aaldiu~aaldiu +aalem~aalem +a'ali~a'ali +aalilaassamthey~aalilaassamthey +aalin~aalin +aaliyan~aaliyan +aaliyan's~aaliyan's +aamadu~aamadu +aamara~aamara +aambala~aambala +aamera~aamera +aamer's~aamer's +aamina~aamina +aaminah~aaminah +aamjiwnaang~aamjiwnaang \ No newline at end of file diff --git a/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh index c845f44cd..0713d6f34 100644 --- a/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh @@ -40,5 +40,15 @@ testTNOrdinal() { runtest $input } 
+testTNWhitelist() { + input=$PROJECT_DIR/fr/data_text_normalization/test_cases_whitelist.txt + runtest $input +} + +testTNWord() { + input=$PROJECT_DIR/fr/data_text_normalization/test_cases_word.txt + runtest $input +} + # Load shUnit2 . $PROJECT_DIR/../shunit2/shunit2 \ No newline at end of file diff --git a/tests/nemo_text_processing/fr/test_whitelist.py b/tests/nemo_text_processing/fr/test_whitelist.py index 075584b3b..db9212278 100644 --- a/tests/nemo_text_processing/fr/test_whitelist.py +++ b/tests/nemo_text_processing/fr/test_whitelist.py @@ -15,6 +15,7 @@ import pytest from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file @@ -29,3 +30,12 @@ class TestWhitelist: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + + @parameterized.expand(parse_test_case_file('fr/data_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/fr/test_word.py b/tests/nemo_text_processing/fr/test_word.py index 46057117f..11b5b5791 100644 --- a/tests/nemo_text_processing/fr/test_word.py +++ b/tests/nemo_text_processing/fr/test_word.py @@ -15,6 +15,7 @@ import pytest from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file @@ -29,3 +30,12 @@ class TestWord: def 
test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + + @parameterized.expand(parse_test_case_file('fr/data_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected From c4f455361256f6036d623579d4821303726886d8 Mon Sep 17 00:00:00 2001 From: anand-nv <105917641+anand-nv@users.noreply.github.com> Date: Fri, 19 Jan 2024 23:29:26 +0530 Subject: [PATCH 06/90] Sh jenkins (#127) * Add SH tests to Jenkins Signed-off-by: Anand Joseph * Update cache paths Signed-off-by: Anand Joseph * Update Jenkins tests Signed-off-by: Anand Joseph * Add CI/CD tests for sparrowhawk Signed-off-by: Anand Joseph * docker build only if in test mode Signed-off-by: Anand Joseph * Fix missing variable Signed-off-by: Anand Joseph * Fix comments and remove arguments not required Signed-off-by: Anand Joseph * Fix commands not executing Signed-off-by: Anand Joseph * Missing arguments Signed-off-by: Anand Joseph * Missing quotes Signed-off-by: Anand Joseph * Fix incorrect path for tests Signed-off-by: Anand Joseph * Fix paths Signed-off-by: Anand Joseph * Incorrect paths of tests and shunit2 Signed-off-by: Anand Joseph * Fix issues with paths as arguments to shunit Signed-off-by: Anand Joseph * Undo path change Signed-off-by: Anand Joseph * Fix intentional fail test Signed-off-by: Anand Joseph * revert redundant check for cased option Signed-off-by: Anand Joseph * Fix default path in export_grammars.sh Signed-off-by: Anand Joseph * Update cache paths Signed-off-by: Anand Joseph * Add interactive option Signed-off-by: Anand Joseph * Add SH tests for cased EN ITN Signed-off-by: Anand Joseph --------- Signed-off-by: Anand Joseph Signed-off-by: 
anand-nv <105917641+anand-nv@users.noreply.github.com> Signed-off-by: Alex Cui --- Jenkinsfile | 35 ++++++++++- .../test_cases_measure.txt | 2 +- .../test_cases_whitelist.txt | 1 - ..._sparrowhawk_inverse_text_normalization.sh | 31 +++++----- ...owhawk_inverse_text_normalization_cased.sh | 54 +++++++++-------- .../en/test_sparrowhawk_normalization.sh | 44 +++++++------- .../docker/launch.sh | 10 ++-- .../export_grammars.sh | 38 +++++++++--- tools/text_processing_deployment/sh_test.sh | 58 +++++++++++++++++++ 9 files changed, 200 insertions(+), 73 deletions(-) create mode 100644 tools/text_processing_deployment/sh_test.sh diff --git a/Jenkinsfile b/Jenkinsfile index 346426884..d1b4062e4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -15,7 +15,7 @@ pipeline { DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-18-23-0' ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0' - ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-1' + ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2' FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/12-05-23-0' HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' @@ -381,6 +381,39 @@ pipeline { } } + stage('L2: Sparrowhawk Tests') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + stages { + stage('L2: EN ITN Run Sparrowhawk test - Lower Cased Input') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" cd tools/text_processing_deployment && bash sh_test.sh --MODE="test_itn_grammars" --OVERWRITE_CACHE=False --FAR_PATH=${EN_TN_CACHE}/SH_ITN --LANGUAGE="en"' + sh 'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_inverse_text_normalization.sh `pwd`' + + } + } + stage('L2: EN ITN Run Sparrowhawk test - Cased Input') { + steps { + sh 
'CUDA_VISIBLE_DEVICES="" cd tools/text_processing_deployment && bash sh_test.sh --MODE="test_itn_grammars" --INPUT_CASE="cased" --OVERWRITE_CACHE=False --FAR_PATH=${EN_TN_CACHE}/SH_ITN_cased --LANGUAGE="en"' + sh 'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_inverse_text_normalization_cased.sh `pwd`' + + } + } + stage('L2: EN TN Run Sparrowhawk test') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" cd tools/text_processing_deployment && bash sh_test.sh --MODE="test_tn_grammars" --OVERWRITE_CACHE=False --FAR_PATH=${EN_TN_CACHE}/SH_TN --GRAMMARS="tn_grammars" --LANGUAGE="en" ' + sh 'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_normalization.sh `pwd`' + } + } + + } + } + stage('L2: NeMo text processing') { when { anyOf { diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_measure.txt index e4d78e413..166bccc81 100644 --- a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_measure.txt @@ -75,7 +75,7 @@ eight hundred kilo watt hours~800 kWh eight hundred kilowatts~800 kW eight hundred megahertz~800 mhz eight hundred ninety four c c~894 cc -eight hundred ninety kilowatts~890 kW` +eight hundred ninety kilowatts~890 kW eight hundred ninety millimeters~890 mm eight hundred ninety two square kilometers~892 km² eight hundred seventy horsepower~870 hp diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_whitelist.txt index cca1dc440..8d62bcc43 100644 --- a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_whitelist.txt +++ b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_whitelist.txt @@ -7,7 +7,6 @@ s and p 
five hundred~S&P 500 seven eleven stores~7-eleven stores r t x~RTX cat five e~CAT5e -nvidia a one hundred~Nvidia A100 c u d n n~cuDNN p c i e x eight~PCIe x8 l g a eleven fifty~LGA 1150 diff --git a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh index 2633670c0..47f726ef4 100644 --- a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh @@ -1,9 +1,10 @@ #! /bin/sh -PROJECT_DIR=/workspace/tests +TEST_DIR=${1:-"/workspace/tests/en"} runtest () { input=$1 + echo "INPUT is $input" cd /workspace/sparrowhawk/documentation/grammars # read test file @@ -21,59 +22,63 @@ runtest () { } testITNCardinal() { - input=$PROJECT_DIR/en/data_inverse_text_normalization/test_cases_cardinal.txt + input=$TEST_DIR/data_inverse_text_normalization/test_cases_cardinal.txt runtest $input } testITNDate() { - input=$PROJECT_DIR/en/data_inverse_text_normalization/test_cases_date.txt + input=$TEST_DIR/data_inverse_text_normalization/test_cases_date.txt runtest $input } testITNDecimal() { - input=$PROJECT_DIR/en/data_inverse_text_normalization/test_cases_decimal.txt + input=$TEST_DIR/data_inverse_text_normalization/test_cases_decimal.txt runtest $input } testITNElectronic() { - input=$PROJECT_DIR/en/data_inverse_text_normalization/test_cases_electronic.txt + input=$TEST_DIR/data_inverse_text_normalization/test_cases_electronic.txt runtest $input } testITNOrdinal() { - input=$PROJECT_DIR/en/data_inverse_text_normalization/test_cases_ordinal.txt + input=$TEST_DIR/data_inverse_text_normalization/test_cases_ordinal.txt runtest $input } testITNTime() { - input=$PROJECT_DIR/en/data_inverse_text_normalization/test_cases_time.txt + input=$TEST_DIR/data_inverse_text_normalization/test_cases_time.txt runtest $input } testITNMeasure() { - 
input=$PROJECT_DIR/en/data_inverse_text_normalization/test_cases_measure.txt + input=$TEST_DIR/data_inverse_text_normalization/test_cases_measure.txt runtest $input } testITNMoney() { - input=$PROJECT_DIR/en/data_inverse_text_normalization/test_cases_money.txt + input=$TEST_DIR/data_inverse_text_normalization/test_cases_money.txt runtest $input } testITNWhitelist() { - input=$PROJECT_DIR/en/data_inverse_text_normalization/test_cases_whitelist.txt + input=$TEST_DIR/data_inverse_text_normalization/test_cases_whitelist.txt runtest $input } testITNTelephone() { - input=$PROJECT_DIR/en/data_inverse_text_normalization/test_cases_telephone.txt + input=$TEST_DIR/data_inverse_text_normalization/test_cases_telephone.txt runtest $input } testITNWord() { - input=$PROJECT_DIR/en/data_inverse_text_normalization/test_cases_word.txt + input=$TEST_DIR/data_inverse_text_normalization/test_cases_word.txt runtest $input } + +# Remove all command-line arguments +shift $# + # Load shUnit2 -. $PROJECT_DIR/../shunit2/shunit2 +. /workspace/shunit2/shunit2 diff --git a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh index a7d926357..58407fcde 100644 --- a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh +++ b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh @@ -1,9 +1,11 @@ #! 
/bin/sh -PROJECT_DIR=/workspace/tests +TEST_DIR=${1:-"/workspace/tests/en"} runtest () { input=$1 + echo "INPUT is $input" + cd /workspace/sparrowhawk/documentation/grammars # read test file @@ -21,59 +23,63 @@ runtest () { } testITNCardinal() { - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_cardinal.txt - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_cardinal_cased.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_cardinal.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_cardinal_cased.txt } testITNDate() { - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_date.txt - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_date_cased.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_date.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_date_cased.txt } testITNDecimal() { - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_decimal.txt - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_decimal_cased.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_decimal.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_decimal_cased.txt } testITNElectronic() { - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_electronic.txt - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_electronic_cased.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_electronic.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_electronic_cased.txt } testITNOrdinal() { - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_ordinal.txt - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_ordinal_cased.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_ordinal.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_ordinal_cased.txt } testITNTime() { - runtest 
$PROJECT_DIR/en/data_inverse_text_normalization/test_cases_time.txt - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_time_cased.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_time.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_time_cased.txt } testITNMeasure() { - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_measure.txt - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_measure_cased.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_measure.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_measure_cased.txt } testITNMoney() { - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_money.txt - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_money_cased.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_money.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_money_cased.txt } testITNWhitelist() { - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_whitelist.txt - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_whitelist_cased.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_whitelist.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_whitelist_cased.txt } testITNTelephone() { - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_telephone.txt - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_telephone_cased.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_telephone.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_telephone_cased.txt } testITNWord() { - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_word.txt - runtest $PROJECT_DIR/en/data_inverse_text_normalization/test_cases_word_cased.txt + runtest $TEST_DIR/data_inverse_text_normalization/test_cases_word.txt + runtest 
$TEST_DIR/data_inverse_text_normalization/test_cases_word_cased.txt } + +# Remove all command-line arguments +shift $# + # Load shUnit2 -. $PROJECT_DIR/../shunit2/shunit2 +. /workspace/shunit2/shunit2 diff --git a/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh index 7baa7c198..1969d64e9 100644 --- a/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh @@ -1,6 +1,5 @@ #! /bin/sh - -PROJECT_DIR=/workspace/tests +TEST_DIR=${1:-"/workspace/tests/en"} runtest () { input=$1 @@ -22,94 +21,97 @@ runtest () { } testTNSpecialText() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_special_text.txt + input=$TEST_DIR/data_text_normalization/test_cases_special_text.txt runtest $input } testTNCardinal() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_cardinal.txt + input=$TEST_DIR/data_text_normalization/test_cases_cardinal.txt runtest $input } testTNDate() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_date.txt + input=$TEST_DIR/data_text_normalization/test_cases_date.txt runtest $input } testTNDecimal() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_decimal.txt + input=$TEST_DIR/data_text_normalization/test_cases_decimal.txt runtest $input } testTNRange() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_range.txt + input=$TEST_DIR/data_text_normalization/test_cases_range.txt runtest $input } testTNSerial() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_serial.txt + input=$TEST_DIR/data_text_normalization/test_cases_serial.txt runtest $input } #testTNRoman() { -# input=$PROJECT_DIR/en/data_text_normalization/test_cases_roman.txt +# input=$TEST_DIR/data_text_normalization/test_cases_roman.txt # runtest $input #} testTNElectronic() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_electronic.txt + 
input=$TEST_DIR/data_text_normalization/test_cases_electronic.txt runtest $input } testTNFraction() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_fraction.txt + input=$TEST_DIR/data_text_normalization/test_cases_fraction.txt runtest $input } testTNMoney() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_money.txt + input=$TEST_DIR/data_text_normalization/test_cases_money.txt runtest $input } testTNOrdinal() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_ordinal.txt + input=$TEST_DIR/data_text_normalization/test_cases_ordinal.txt runtest $input } testTNTelephone() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_telephone.txt + input=$TEST_DIR/data_text_normalization/test_cases_telephone.txt runtest $input } testTNTime() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_time.txt + input=$TEST_DIR/data_text_normalization/test_cases_time.txt runtest $input } testTNMeasure() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_measure.txt + input=$TEST_DIR/data_text_normalization/test_cases_measure.txt runtest $input } testTNWhitelist() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_whitelist.txt + input=$TEST_DIR/data_text_normalization/test_cases_whitelist.txt runtest $input } testTNWord() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_word.txt + input=$TEST_DIR/data_text_normalization/test_cases_word.txt runtest $input } testTNAddress() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_address.txt + input=$TEST_DIR/data_text_normalization/test_cases_address.txt runtest $input } testTNMath() { - input=$PROJECT_DIR/en/data_text_normalization/test_cases_math.txt + input=$TEST_DIR/data_text_normalization/test_cases_math.txt runtest $input } +# Remove all command-line arguments +shift $# + # Load shUnit2 -. $PROJECT_DIR/../shunit2/shunit2 +. 
/workspace/shunit2/shunit2 diff --git a/tools/text_processing_deployment/docker/launch.sh b/tools/text_processing_deployment/docker/launch.sh index 09d1bb523..1ba641ac0 100644 --- a/tools/text_processing_deployment/docker/launch.sh +++ b/tools/text_processing_deployment/docker/launch.sh @@ -14,13 +14,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -MODE=${1:-"export"} +MODE=${1:-"interactive"} LANGUAGE=${2:-"en"} INPUT_CASE=${3:-"lower_cased"} SCRIPT_DIR=$(cd $(dirname $0); pwd) -: ${CLASSIFY_DIR:="$SCRIPT_DIR/../$LANGUAGE/classify"} -: ${VERBALIZE_DIR:="$SCRIPT_DIR/../$LANGUAGE/verbalize"} -: ${CMD:=${4:-"/bin/bash"}} +GRAMMAR_DIR=${4:-${SCRIPT_DIR}"/.."} + +: ${CLASSIFY_DIR:="$GRAMMAR_DIR/$LANGUAGE/classify"} +: ${VERBALIZE_DIR:="$GRAMMAR_DIR/$LANGUAGE/verbalize"} +: ${CMD:=${5:-"/bin/bash"}} MOUNTS="" MOUNTS+=" -v $CLASSIFY_DIR:/workspace/sparrowhawk/documentation/grammars/en_toy/classify" diff --git a/tools/text_processing_deployment/export_grammars.sh b/tools/text_processing_deployment/export_grammars.sh index b36ae3a03..d44f876e6 100644 --- a/tools/text_processing_deployment/export_grammars.sh +++ b/tools/text_processing_deployment/export_grammars.sh @@ -33,10 +33,12 @@ GRAMMARS="itn_grammars" # tn_grammars INPUT_CASE="lower_cased" # cased LANGUAGE="en" # language, {'en', 'es', 'de','zh'} supports both TN and ITN, {'pt', 'ru', 'fr', 'vi'} supports ITN only -MODE="export" +MODE="export" # default is one of {'export', 'interactive', 'test', 'ci'}. 
Default "export" OVERWRITE_CACHE="True" # Set to False to re-use .far files FORCE_REBUILD="False" # Set to True to re-build docker file WHITELIST=None # Path to a whitelist file, if None the default will be used +FAR_PATH=$(pwd) # Path where the grammars should be written +SKIP_FAR_CREATION="False" for ARG in "$@" do @@ -50,7 +52,7 @@ do done -CACHE_DIR=${LANGUAGE} +CACHE_DIR=${FAR_PATH}/${LANGUAGE} echo "GRAMMARS = $GRAMMARS" echo "MODE = $MODE" echo "LANGUAGE = $LANGUAGE" @@ -61,11 +63,24 @@ echo "FORCE_REBUILD = $FORCE_REBUILD" echo "WHITELIST = $WHITELIST" -if [[ ${OVERWRITE_CACHE,,} == "true" ]]; then +if [[ ${OVERWRITE_CACHE,,} == "true" ]] ; then OVERWRITE_CACHE="--overwrite_cache " - python3 pynini_export.py --output_dir=. --grammars=${GRAMMARS} --input_case=${INPUT_CASE} \ + SKIP_FAR_CREATION="True" +else + OVERWRITE_CACHE="" +fi + +CLASSIFY_FAR=${CACHE_DIR}"/classify/tokenize_and_classify.far" +VERBALIZE_FAR=${CACHE_DIR}"/verbalize/verbalize.far" + +if [[ -f $CLASSIFY_FAR ]] && [[ -f $VERBALIZE_FAR ]] && [[ ${OVERWRITE_CACHE} == "" ]]; then + SKIP_FAR_CREATION="True" + echo "Far files exists and OVERWRITE_CACHE is set to False" +fi + +if [[ ${SKIP_FAR_CREATION} != "True" ]]; then + python3 pynini_export.py --output_dir=${FAR_PATH} --grammars=${GRAMMARS} --input_case=${INPUT_CASE} \ --language=${LANGUAGE} --cache_dir=${CACHE_DIR} --whitelist=${WHITELIST} ${OVERWRITE_CACHE} || exit 1 - else OVERWRITE_CACHE="" fi if [[ ${FORCE_REBUILD,,} == "true" ]]; then @@ -74,10 +89,17 @@ if [[ ${FORCE_REBUILD,,} == "true" ]]; then fi find . 
-name "Makefile" -type f -delete -bash docker/build.sh $FORCE_REBUILD -if [[ ${MODE} == "test" ]]; then + + + + +if [[ ${MODE} == "test" ]] || [[ ${MODE} == "interactive" ]]; then MODE=${MODE}_${GRAMMARS} + bash docker/build.sh $FORCE_REBUILD + bash docker/launch.sh $MODE $LANGUAGE $INPUT_CASE $FAR_PATH +else + exit 0 fi -bash docker/launch.sh $MODE $LANGUAGE $INPUT_CASE + diff --git a/tools/text_processing_deployment/sh_test.sh b/tools/text_processing_deployment/sh_test.sh new file mode 100644 index 000000000..b66686991 --- /dev/null +++ b/tools/text_processing_deployment/sh_test.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script runs the CI/CD tests for Sparrowhawk. It calls export_grammars.sh to create the grammars. 
+ + +GRAMMARS="itn_grammars" # tn_grammars +INPUT_CASE="lower_cased" # cased +LANGUAGE="en" # language, {'en', 'es', 'de','zh'} supports both TN and ITN, {'pt', 'ru', 'fr', 'vi'} supports ITN only +OVERWRITE_CACHE="False" # Set to False to re-use .far files +WHITELIST=None # Path to a whitelist file, if None the default will be used +FAR_PATH=$(pwd) # Path where the grammars should be written +MODE="test_itn_grammars" + +for ARG in "$@" +do + key=$(echo $ARG | cut -f1 -d=) + value=$(echo $ARG | cut -f2 -d=) + + if [[ $key == *"--"* ]]; then + v="${key/--/}" + declare $v="${value}" + fi +done + + +CACHE_DIR=${FAR_PATH}/${LANGUAGE} +echo "GRAMMARS = $GRAMMARS" +echo "LANGUAGE = $LANGUAGE" +echo "INPUT_CASE = $INPUT_CASE" +echo "CACHE_DIR = $CACHE_DIR" +echo "OVERWRITE_CACHE = $OVERWRITE_CACHE" +echo "FORCE_REBUILD = $FORCE_REBUILD" +echo "WHITELIST = $WHITELIST" + +bash export_grammars.sh --MODE="export" --GRAMMARS=$GRAMMARS --LANGUAGE=$LANGUAGE --INPUT_CASE=$INPUT_CASE \ + --FAR_PATH=$FAR_PATH --CACHE_DIR=$CACHE_DIR --OVERWRITE_CACHE=$OVERWRITE_CACHE --FORCE_REBUILD=$FORCE_REBUILD \ + --WHITELIST=$WHITELIST + +CLASSIFY_FAR=${CACHE_DIR}"/classify/tokenize_and_classify.far" +VERBALIZE_FAR=${CACHE_DIR}"/verbalize/verbalize.far" + +cp $CLASSIFY_FAR /workspace/sparrowhawk/documentation/grammars/en_toy/classify/ +cp $VERBALIZE_FAR /workspace/sparrowhawk/documentation/grammars/en_toy/verbalize/ + From 5561c48c13e3a7773cae0d9627a4df8cda7cea2c Mon Sep 17 00:00:00 2001 From: Evelina <10428420+ekmb@users.noreply.github.com> Date: Wed, 14 Feb 2024 14:28:26 -0800 Subject: [PATCH 07/90] update isort - fix precommit (#138) * update isort version Signed-off-by: Evelina * update isort version Signed-off-by: Evelina * fix format Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove unused imports Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui --- .pre-commit-config.yaml | 2 +- nemo_text_processing/fst_alignment/alignment.py | 1 - nemo_text_processing/hybrid/model_utils.py | 3 ++- nemo_text_processing/hybrid/utils.py | 5 +++-- nemo_text_processing/hybrid/wfst_lm_rescoring.py | 3 ++- .../inverse_text_normalization/ar/__init__.py | 3 ++- .../inverse_text_normalization/ar/taggers/__init__.py | 3 ++- .../inverse_text_normalization/ar/taggers/cardinal.py | 3 ++- .../inverse_text_normalization/ar/taggers/decimal.py | 3 ++- .../inverse_text_normalization/ar/taggers/fraction.py | 3 ++- .../inverse_text_normalization/ar/taggers/measure.py | 3 ++- .../inverse_text_normalization/ar/taggers/money.py | 3 ++- .../inverse_text_normalization/ar/taggers/punctuation.py | 3 ++- .../ar/taggers/tokenize_and_classify.py | 3 ++- .../inverse_text_normalization/ar/taggers/word.py | 3 ++- .../ar/verbalizers/__init__.py | 1 + .../ar/verbalizers/cardinal.py | 3 ++- .../inverse_text_normalization/ar/verbalizers/decimal.py | 3 ++- .../ar/verbalizers/fraction.py | 3 ++- .../inverse_text_normalization/ar/verbalizers/measure.py | 3 ++- .../inverse_text_normalization/ar/verbalizers/money.py | 3 ++- .../ar/verbalizers/verbalize_final.py | 3 ++- .../inverse_text_normalization/ar/verbalizers/word.py | 3 ++- .../inverse_text_normalization/de/taggers/cardinal.py | 3 ++- .../inverse_text_normalization/de/taggers/date.py | 3 ++- .../inverse_text_normalization/de/taggers/decimal.py | 3 ++- .../inverse_text_normalization/de/taggers/electronic.py | 3 ++- .../inverse_text_normalization/de/taggers/fraction.py | 3 ++- .../inverse_text_normalization/de/taggers/measure.py | 3 ++- .../inverse_text_normalization/de/taggers/money.py | 3 ++- .../inverse_text_normalization/de/taggers/ordinal.py | 3 ++- .../inverse_text_normalization/de/taggers/telephone.py | 3 ++- 
.../inverse_text_normalization/de/taggers/time.py | 3 ++- .../de/taggers/tokenize_and_classify.py | 3 ++- .../inverse_text_normalization/de/taggers/whitelist.py | 3 ++- .../de/verbalizers/cardinal.py | 3 ++- .../inverse_text_normalization/de/verbalizers/decimal.py | 3 ++- .../inverse_text_normalization/de/verbalizers/measure.py | 3 ++- .../inverse_text_normalization/de/verbalizers/money.py | 3 ++- .../inverse_text_normalization/de/verbalizers/time.py | 3 ++- .../de/verbalizers/verbalize_final.py | 3 ++- .../inverse_text_normalization/en/clean_eval_data.py | 2 +- .../inverse_text_normalization/en/taggers/cardinal.py | 3 ++- .../inverse_text_normalization/en/taggers/date.py | 3 ++- .../inverse_text_normalization/en/taggers/decimal.py | 3 ++- .../inverse_text_normalization/en/taggers/electronic.py | 3 ++- .../inverse_text_normalization/en/taggers/measure.py | 3 ++- .../inverse_text_normalization/en/taggers/money.py | 3 ++- .../inverse_text_normalization/en/taggers/ordinal.py | 3 ++- .../inverse_text_normalization/en/taggers/punctuation.py | 3 ++- .../inverse_text_normalization/en/taggers/telephone.py | 3 ++- .../inverse_text_normalization/en/taggers/time.py | 3 ++- .../en/taggers/tokenize_and_classify.py | 3 ++- .../inverse_text_normalization/en/taggers/whitelist.py | 3 ++- .../inverse_text_normalization/en/taggers/word.py | 3 ++- .../en/verbalizers/cardinal.py | 3 ++- .../inverse_text_normalization/en/verbalizers/date.py | 3 ++- .../inverse_text_normalization/en/verbalizers/decimal.py | 3 ++- .../en/verbalizers/electronic.py | 3 ++- .../inverse_text_normalization/en/verbalizers/measure.py | 3 ++- .../inverse_text_normalization/en/verbalizers/money.py | 3 ++- .../inverse_text_normalization/en/verbalizers/ordinal.py | 3 ++- .../en/verbalizers/telephone.py | 3 ++- .../inverse_text_normalization/en/verbalizers/time.py | 3 ++- .../en/verbalizers/verbalize_final.py | 3 ++- .../en/verbalizers/whitelist.py | 3 ++- .../inverse_text_normalization/en/verbalizers/word.py | 3 ++- 
.../inverse_text_normalization/es/graph_utils.py | 3 ++- .../inverse_text_normalization/es/taggers/cardinal.py | 3 ++- .../inverse_text_normalization/es/taggers/date.py | 3 ++- .../inverse_text_normalization/es/taggers/decimal.py | 3 ++- .../inverse_text_normalization/es/taggers/electronic.py | 3 ++- .../inverse_text_normalization/es/taggers/fraction.py | 3 ++- .../inverse_text_normalization/es/taggers/measure.py | 3 ++- .../inverse_text_normalization/es/taggers/money.py | 3 ++- .../inverse_text_normalization/es/taggers/ordinal.py | 3 ++- .../inverse_text_normalization/es/taggers/punctuation.py | 3 ++- .../inverse_text_normalization/es/taggers/telephone.py | 3 ++- .../inverse_text_normalization/es/taggers/time.py | 3 ++- .../es/taggers/tokenize_and_classify.py | 3 ++- .../inverse_text_normalization/es/taggers/whitelist.py | 3 ++- .../inverse_text_normalization/es/taggers/word.py | 3 ++- .../es/verbalizers/cardinal.py | 3 ++- .../inverse_text_normalization/es/verbalizers/date.py | 3 ++- .../inverse_text_normalization/es/verbalizers/decimal.py | 3 ++- .../es/verbalizers/electronic.py | 3 ++- .../es/verbalizers/fraction.py | 3 ++- .../inverse_text_normalization/es/verbalizers/measure.py | 3 ++- .../inverse_text_normalization/es/verbalizers/money.py | 3 ++- .../inverse_text_normalization/es/verbalizers/ordinal.py | 3 ++- .../es/verbalizers/telephone.py | 3 ++- .../inverse_text_normalization/es/verbalizers/time.py | 3 ++- .../es/verbalizers/verbalize_final.py | 3 ++- .../es/verbalizers/whitelist.py | 3 ++- .../inverse_text_normalization/es/verbalizers/word.py | 3 ++- .../inverse_text_normalization/es_en/graph_utils.py | 3 ++- .../es_en/taggers/tokenize_and_classify.py | 3 ++- .../es_en/verbalizers/verbalize.py | 3 ++- .../es_en/verbalizers/verbalize_final.py | 3 ++- .../inverse_text_normalization/fr/graph_utils.py | 5 +++-- .../inverse_text_normalization/fr/taggers/cardinal.py | 3 ++- .../inverse_text_normalization/fr/taggers/date.py | 3 ++- 
.../inverse_text_normalization/fr/taggers/decimal.py | 3 ++- .../inverse_text_normalization/fr/taggers/electronic.py | 3 ++- .../inverse_text_normalization/fr/taggers/fraction.py | 3 ++- .../inverse_text_normalization/fr/taggers/measure.py | 3 ++- .../inverse_text_normalization/fr/taggers/money.py | 3 ++- .../inverse_text_normalization/fr/taggers/ordinal.py | 3 ++- .../inverse_text_normalization/fr/taggers/punctuation.py | 3 ++- .../inverse_text_normalization/fr/taggers/telephone.py | 3 ++- .../inverse_text_normalization/fr/taggers/time.py | 3 ++- .../fr/taggers/tokenize_and_classify.py | 3 ++- .../inverse_text_normalization/fr/taggers/whitelist.py | 3 ++- .../inverse_text_normalization/fr/taggers/word.py | 3 ++- .../fr/verbalizers/cardinal.py | 3 ++- .../inverse_text_normalization/fr/verbalizers/date.py | 3 ++- .../inverse_text_normalization/fr/verbalizers/decimal.py | 3 ++- .../fr/verbalizers/electronic.py | 3 ++- .../fr/verbalizers/fraction.py | 3 ++- .../inverse_text_normalization/fr/verbalizers/measure.py | 3 ++- .../inverse_text_normalization/fr/verbalizers/money.py | 3 ++- .../inverse_text_normalization/fr/verbalizers/ordinal.py | 3 ++- .../fr/verbalizers/telephone.py | 3 ++- .../inverse_text_normalization/fr/verbalizers/time.py | 3 ++- .../fr/verbalizers/verbalize_final.py | 3 ++- .../fr/verbalizers/whitelist.py | 3 ++- .../inverse_text_normalization/fr/verbalizers/word.py | 3 ++- .../inverse_text_normalization/pt/taggers/cardinal.py | 3 ++- .../inverse_text_normalization/pt/taggers/date.py | 3 ++- .../inverse_text_normalization/pt/taggers/decimal.py | 3 ++- .../inverse_text_normalization/pt/taggers/electronic.py | 3 ++- .../inverse_text_normalization/pt/taggers/measure.py | 3 ++- .../inverse_text_normalization/pt/taggers/money.py | 3 ++- .../inverse_text_normalization/pt/taggers/ordinal.py | 3 ++- .../inverse_text_normalization/pt/taggers/punctuation.py | 3 ++- .../inverse_text_normalization/pt/taggers/telephone.py | 3 ++- 
.../inverse_text_normalization/pt/taggers/time.py | 3 ++- .../pt/taggers/tokenize_and_classify.py | 3 ++- .../inverse_text_normalization/pt/taggers/whitelist.py | 3 ++- .../inverse_text_normalization/pt/taggers/word.py | 3 ++- .../pt/verbalizers/cardinal.py | 3 ++- .../inverse_text_normalization/pt/verbalizers/date.py | 3 ++- .../inverse_text_normalization/pt/verbalizers/decimal.py | 3 ++- .../pt/verbalizers/electronic.py | 3 ++- .../inverse_text_normalization/pt/verbalizers/measure.py | 3 ++- .../inverse_text_normalization/pt/verbalizers/money.py | 3 ++- .../inverse_text_normalization/pt/verbalizers/ordinal.py | 3 ++- .../pt/verbalizers/telephone.py | 3 ++- .../inverse_text_normalization/pt/verbalizers/time.py | 3 ++- .../pt/verbalizers/verbalize_final.py | 3 ++- .../pt/verbalizers/whitelist.py | 3 ++- .../inverse_text_normalization/pt/verbalizers/word.py | 3 ++- .../inverse_text_normalization/ru/taggers/cardinal.py | 3 ++- .../inverse_text_normalization/ru/taggers/date.py | 3 ++- .../inverse_text_normalization/ru/taggers/decimals.py | 3 ++- .../inverse_text_normalization/ru/taggers/electronic.py | 3 ++- .../inverse_text_normalization/ru/taggers/measure.py | 3 ++- .../inverse_text_normalization/ru/taggers/money.py | 3 ++- .../inverse_text_normalization/ru/taggers/ordinal.py | 3 ++- .../inverse_text_normalization/ru/taggers/telephone.py | 3 ++- .../inverse_text_normalization/ru/taggers/time.py | 3 ++- .../ru/taggers/tokenize_and_classify.py | 3 ++- .../inverse_text_normalization/ru/taggers/whitelist.py | 3 ++- .../ru/verbalizers/cardinal.py | 3 ++- .../inverse_text_normalization/ru/verbalizers/date.py | 3 ++- .../inverse_text_normalization/ru/verbalizers/decimal.py | 3 ++- .../ru/verbalizers/electronic.py | 3 ++- .../inverse_text_normalization/ru/verbalizers/measure.py | 3 ++- .../inverse_text_normalization/ru/verbalizers/money.py | 3 ++- .../inverse_text_normalization/ru/verbalizers/ordinal.py | 3 ++- .../ru/verbalizers/telephone.py | 3 ++- 
.../inverse_text_normalization/ru/verbalizers/time.py | 3 ++- .../ru/verbalizers/verbalize_final.py | 3 ++- .../inverse_text_normalization/run_evaluate.py | 1 - .../inverse_text_normalization/sv/taggers/cardinal.py | 3 ++- .../inverse_text_normalization/sv/taggers/date.py | 3 ++- .../inverse_text_normalization/sv/taggers/decimal.py | 3 ++- .../inverse_text_normalization/sv/taggers/electronic.py | 3 ++- .../inverse_text_normalization/sv/taggers/fraction.py | 3 ++- .../inverse_text_normalization/sv/taggers/ordinal.py | 3 ++- .../inverse_text_normalization/sv/taggers/telephone.py | 3 ++- .../inverse_text_normalization/sv/taggers/time.py | 3 ++- .../sv/taggers/tokenize_and_classify.py | 3 ++- .../inverse_text_normalization/sv/taggers/whitelist.py | 3 ++- .../sv/verbalizers/cardinal.py | 3 ++- .../inverse_text_normalization/sv/verbalizers/date.py | 3 ++- .../inverse_text_normalization/sv/verbalizers/decimal.py | 3 ++- .../inverse_text_normalization/sv/verbalizers/time.py | 3 ++- .../sv/verbalizers/verbalize_final.py | 3 ++- .../inverse_text_normalization/vi/graph_utils.py | 3 ++- .../inverse_text_normalization/vi/taggers/cardinal.py | 3 ++- .../inverse_text_normalization/vi/taggers/date.py | 3 ++- .../inverse_text_normalization/vi/taggers/decimal.py | 3 ++- .../inverse_text_normalization/vi/taggers/electronic.py | 3 ++- .../inverse_text_normalization/vi/taggers/fraction.py | 3 ++- .../inverse_text_normalization/vi/taggers/measure.py | 3 ++- .../inverse_text_normalization/vi/taggers/money.py | 3 ++- .../inverse_text_normalization/vi/taggers/ordinal.py | 3 ++- .../inverse_text_normalization/vi/taggers/punctuation.py | 3 ++- .../inverse_text_normalization/vi/taggers/telephone.py | 3 ++- .../inverse_text_normalization/vi/taggers/time.py | 3 ++- .../vi/taggers/tokenize_and_classify.py | 3 ++- .../inverse_text_normalization/vi/taggers/whitelist.py | 3 ++- .../inverse_text_normalization/vi/taggers/word.py | 3 ++- .../vi/verbalizers/cardinal.py | 3 ++- 
.../inverse_text_normalization/vi/verbalizers/date.py | 3 ++- .../inverse_text_normalization/vi/verbalizers/decimal.py | 3 ++- .../vi/verbalizers/electronic.py | 3 ++- .../vi/verbalizers/fraction.py | 3 ++- .../inverse_text_normalization/vi/verbalizers/measure.py | 3 ++- .../inverse_text_normalization/vi/verbalizers/money.py | 3 ++- .../inverse_text_normalization/vi/verbalizers/ordinal.py | 3 ++- .../vi/verbalizers/telephone.py | 3 ++- .../inverse_text_normalization/vi/verbalizers/time.py | 3 ++- .../vi/verbalizers/verbalize_final.py | 3 ++- .../vi/verbalizers/whitelist.py | 3 ++- .../inverse_text_normalization/vi/verbalizers/word.py | 3 ++- .../inverse_text_normalization/zh/taggers/cardinal.py | 3 ++- .../inverse_text_normalization/zh/taggers/date.py | 3 ++- .../inverse_text_normalization/zh/taggers/decimal.py | 3 ++- .../inverse_text_normalization/zh/taggers/fraction.py | 3 ++- .../inverse_text_normalization/zh/taggers/money.py | 3 ++- .../inverse_text_normalization/zh/taggers/ordinal.py | 3 ++- .../inverse_text_normalization/zh/taggers/punctuation.py | 3 ++- .../inverse_text_normalization/zh/taggers/time.py | 3 ++- .../zh/taggers/tokenize_and_classify.py | 3 ++- .../inverse_text_normalization/zh/taggers/whitelist.py | 3 ++- .../inverse_text_normalization/zh/taggers/word.py | 3 ++- .../zh/verbalizers/cardinal.py | 3 ++- .../inverse_text_normalization/zh/verbalizers/date.py | 3 ++- .../inverse_text_normalization/zh/verbalizers/decimal.py | 3 ++- .../zh/verbalizers/fraction.py | 3 ++- .../inverse_text_normalization/zh/verbalizers/money.py | 3 ++- .../inverse_text_normalization/zh/verbalizers/ordinal.py | 3 ++- .../inverse_text_normalization/zh/verbalizers/time.py | 3 ++- .../zh/verbalizers/verbalize_final.py | 3 ++- .../zh/verbalizers/whitelist.py | 3 ++- .../inverse_text_normalization/zh/verbalizers/word.py | 3 ++- .../text_normalization/ar/graph_utils.py | 2 +- .../text_normalization/ar/taggers/__init__.py | 1 + .../text_normalization/ar/taggers/cardinal.py | 3 
++- .../text_normalization/ar/taggers/decimal.py | 3 ++- .../text_normalization/ar/taggers/fraction.py | 3 ++- .../text_normalization/ar/taggers/measure.py | 3 ++- .../text_normalization/ar/taggers/money.py | 3 ++- .../ar/taggers/tokenize_and_classify.py | 3 ++- .../text_normalization/ar/taggers/word.py | 3 ++- .../text_normalization/ar/verbalizers/__init__.py | 1 + .../text_normalization/ar/verbalizers/cardinal.py | 3 ++- .../text_normalization/ar/verbalizers/decimal.py | 3 ++- .../text_normalization/ar/verbalizers/fraction.py | 3 ++- .../text_normalization/ar/verbalizers/measure.py | 3 ++- .../text_normalization/ar/verbalizers/money.py | 3 ++- .../text_normalization/ar/verbalizers/verbalize_final.py | 3 ++- .../text_normalization/ar/verbalizers/word.py | 3 ++- .../text_normalization/de/taggers/cardinal.py | 3 ++- .../text_normalization/de/taggers/date.py | 3 ++- .../text_normalization/de/taggers/decimal.py | 3 ++- .../text_normalization/de/taggers/electronic.py | 3 ++- .../text_normalization/de/taggers/fraction.py | 3 ++- .../text_normalization/de/taggers/measure.py | 5 +++-- .../text_normalization/de/taggers/money.py | 3 ++- .../text_normalization/de/taggers/ordinal.py | 3 ++- .../text_normalization/de/taggers/telephone.py | 3 ++- .../text_normalization/de/taggers/time.py | 3 ++- .../de/taggers/tokenize_and_classify.py | 3 ++- .../text_normalization/de/taggers/whitelist.py | 3 ++- .../text_normalization/de/taggers/word.py | 3 ++- .../text_normalization/de/verbalizers/cardinal.py | 3 ++- .../text_normalization/de/verbalizers/date.py | 3 ++- .../text_normalization/de/verbalizers/decimal.py | 3 ++- .../text_normalization/de/verbalizers/electronic.py | 3 ++- .../text_normalization/de/verbalizers/fraction.py | 3 ++- .../text_normalization/de/verbalizers/measure.py | 3 ++- .../text_normalization/de/verbalizers/money.py | 3 ++- .../text_normalization/de/verbalizers/ordinal.py | 3 ++- .../text_normalization/de/verbalizers/telephone.py | 3 ++- 
.../text_normalization/de/verbalizers/time.py | 3 ++- .../text_normalization/de/verbalizers/verbalize_final.py | 3 ++- .../text_normalization/en/clean_eval_data.py | 2 +- .../text_normalization/en/graph_utils.py | 5 +++-- .../text_normalization/en/taggers/abbreviation.py | 3 ++- .../text_normalization/en/taggers/cardinal.py | 5 +++-- .../text_normalization/en/taggers/date.py | 5 +++-- .../text_normalization/en/taggers/decimal.py | 3 ++- .../text_normalization/en/taggers/electronic.py | 3 ++- .../text_normalization/en/taggers/fraction.py | 3 ++- .../text_normalization/en/taggers/measure.py | 5 +++-- .../text_normalization/en/taggers/money.py | 3 ++- .../text_normalization/en/taggers/ordinal.py | 3 ++- .../text_normalization/en/taggers/punctuation.py | 5 +++-- .../text_normalization/en/taggers/range.py | 3 ++- .../text_normalization/en/taggers/roman.py | 3 ++- .../text_normalization/en/taggers/serial.py | 5 +++-- .../text_normalization/en/taggers/telephone.py | 3 ++- .../text_normalization/en/taggers/time.py | 3 ++- .../en/taggers/tokenize_and_classify.py | 3 ++- .../en/taggers/tokenize_and_classify_lm.py | 5 +++-- .../en/taggers/tokenize_and_classify_with_audio.py | 3 ++- .../text_normalization/en/taggers/whitelist.py | 3 ++- .../text_normalization/en/taggers/word.py | 5 +++-- .../text_normalization/en/verbalizers/abbreviation.py | 3 ++- .../text_normalization/en/verbalizers/cardinal.py | 3 ++- .../text_normalization/en/verbalizers/date.py | 5 +++-- .../text_normalization/en/verbalizers/decimal.py | 3 ++- .../text_normalization/en/verbalizers/electronic.py | 5 +++-- .../text_normalization/en/verbalizers/fraction.py | 5 +++-- .../text_normalization/en/verbalizers/measure.py | 3 ++- .../text_normalization/en/verbalizers/money.py | 3 ++- .../text_normalization/en/verbalizers/ordinal.py | 3 ++- .../text_normalization/en/verbalizers/post_processing.py | 3 ++- .../text_normalization/en/verbalizers/roman.py | 3 ++- .../text_normalization/en/verbalizers/telephone.py | 3 ++- 
.../text_normalization/en/verbalizers/time.py | 3 ++- .../text_normalization/en/verbalizers/verbalize_final.py | 3 ++- .../text_normalization/en/verbalizers/whitelist.py | 3 ++- .../text_normalization/en/verbalizers/word.py | 3 ++- .../text_normalization/es/graph_utils.py | 3 ++- .../text_normalization/es/taggers/cardinal.py | 3 ++- .../text_normalization/es/taggers/date.py | 3 ++- .../text_normalization/es/taggers/decimals.py | 3 ++- .../text_normalization/es/taggers/electronic.py | 3 ++- .../text_normalization/es/taggers/fraction.py | 3 ++- .../text_normalization/es/taggers/measure.py | 3 ++- .../text_normalization/es/taggers/money.py | 3 ++- .../text_normalization/es/taggers/ordinal.py | 3 ++- .../text_normalization/es/taggers/telephone.py | 3 ++- .../text_normalization/es/taggers/time.py | 3 ++- .../es/taggers/tokenize_and_classify.py | 3 ++- .../text_normalization/es/taggers/whitelist.py | 3 ++- .../text_normalization/es/taggers/word.py | 3 ++- .../text_normalization/es/verbalizers/cardinal.py | 3 ++- .../text_normalization/es/verbalizers/date.py | 3 ++- .../text_normalization/es/verbalizers/decimals.py | 3 ++- .../text_normalization/es/verbalizers/electronic.py | 3 ++- .../text_normalization/es/verbalizers/fraction.py | 3 ++- .../text_normalization/es/verbalizers/measure.py | 3 ++- .../text_normalization/es/verbalizers/money.py | 3 ++- .../text_normalization/es/verbalizers/ordinal.py | 3 ++- .../text_normalization/es/verbalizers/telephone.py | 3 ++- .../text_normalization/es/verbalizers/time.py | 3 ++- .../text_normalization/es/verbalizers/verbalize_final.py | 3 ++- .../text_normalization/fr/taggers/cardinal.py | 3 ++- .../text_normalization/fr/taggers/decimals.py | 3 ++- .../text_normalization/fr/taggers/fraction.py | 3 ++- .../text_normalization/fr/taggers/ordinal.py | 3 ++- .../fr/taggers/tokenize_and_classify.py | 3 ++- .../text_normalization/fr/taggers/whitelist.py | 3 ++- .../text_normalization/fr/taggers/word.py | 3 ++- 
.../text_normalization/fr/verbalizers/cardinal.py | 3 ++- .../text_normalization/fr/verbalizers/decimals.py | 3 ++- .../text_normalization/fr/verbalizers/fraction.py | 3 ++- .../text_normalization/fr/verbalizers/ordinal.py | 3 ++- .../text_normalization/fr/verbalizers/verbalize_final.py | 3 ++- .../text_normalization/hu/graph_utils.py | 3 ++- .../text_normalization/hu/taggers/cardinal.py | 3 ++- .../text_normalization/hu/taggers/date.py | 3 ++- .../text_normalization/hu/taggers/decimal.py | 3 ++- .../text_normalization/hu/taggers/electronic.py | 3 ++- .../text_normalization/hu/taggers/fraction.py | 3 ++- .../text_normalization/hu/taggers/measure.py | 3 ++- .../text_normalization/hu/taggers/money.py | 3 ++- .../text_normalization/hu/taggers/ordinal.py | 3 ++- .../text_normalization/hu/taggers/telephone.py | 3 ++- .../text_normalization/hu/taggers/time.py | 3 ++- .../hu/taggers/tokenize_and_classify.py | 3 ++- .../text_normalization/hu/taggers/whitelist.py | 3 ++- .../text_normalization/hu/taggers/word.py | 3 ++- .../text_normalization/hu/verbalizers/cardinal.py | 3 ++- .../text_normalization/hu/verbalizers/date.py | 3 ++- .../text_normalization/hu/verbalizers/decimal.py | 3 ++- .../text_normalization/hu/verbalizers/electronic.py | 3 ++- .../text_normalization/hu/verbalizers/fraction.py | 3 ++- .../text_normalization/hu/verbalizers/measure.py | 3 ++- .../text_normalization/hu/verbalizers/money.py | 3 ++- .../text_normalization/hu/verbalizers/ordinal.py | 3 ++- .../text_normalization/hu/verbalizers/telephone.py | 3 ++- .../text_normalization/hu/verbalizers/time.py | 3 ++- .../text_normalization/hu/verbalizers/verbalize_final.py | 3 ++- .../text_normalization/it/taggers/cardinal.py | 3 ++- .../text_normalization/it/taggers/decimals.py | 3 ++- .../text_normalization/it/taggers/electronic.py | 3 ++- .../text_normalization/it/taggers/measure.py | 5 +++-- .../text_normalization/it/taggers/money.py | 3 ++- .../text_normalization/it/taggers/time.py | 3 ++- 
.../it/taggers/tokenize_and_classify.py | 3 ++- .../text_normalization/it/taggers/whitelist.py | 3 ++- .../text_normalization/it/taggers/word.py | 3 ++- .../text_normalization/it/verbalizers/cardinal.py | 3 ++- .../text_normalization/it/verbalizers/decimal.py | 3 ++- .../text_normalization/it/verbalizers/electronic.py | 3 ++- .../text_normalization/it/verbalizers/measure.py | 3 ++- .../text_normalization/it/verbalizers/money.py | 3 ++- .../text_normalization/it/verbalizers/time.py | 3 ++- .../text_normalization/it/verbalizers/verbalize_final.py | 3 ++- nemo_text_processing/text_normalization/normalize.py | 9 +++++---- .../text_normalization/normalize_with_audio.py | 4 ++-- nemo_text_processing/text_normalization/ru/alphabet.py | 1 + .../text_normalization/ru/taggers/cardinal.py | 3 ++- .../text_normalization/ru/taggers/date.py | 3 ++- .../text_normalization/ru/taggers/decimals.py | 3 ++- .../text_normalization/ru/taggers/electronic.py | 3 ++- .../text_normalization/ru/taggers/measure.py | 3 ++- .../text_normalization/ru/taggers/money.py | 3 ++- .../text_normalization/ru/taggers/number_names.py | 3 ++- .../text_normalization/ru/taggers/ordinal.py | 3 ++- .../text_normalization/ru/taggers/telephone.py | 3 ++- .../text_normalization/ru/taggers/time.py | 3 ++- .../ru/taggers/tokenize_and_classify.py | 3 ++- .../text_normalization/ru/taggers/whitelist.py | 3 ++- .../text_normalization/ru/taggers/word.py | 3 ++- .../text_normalization/ru/verbalizers/cardinal.py | 3 ++- .../text_normalization/ru/verbalizers/date.py | 3 ++- .../text_normalization/ru/verbalizers/decimal.py | 3 ++- .../text_normalization/ru/verbalizers/electronic.py | 3 ++- .../text_normalization/ru/verbalizers/measure.py | 3 ++- .../text_normalization/ru/verbalizers/money.py | 3 ++- .../text_normalization/ru/verbalizers/ordinal.py | 3 ++- .../text_normalization/ru/verbalizers/telephone.py | 3 ++- .../text_normalization/ru/verbalizers/time.py | 3 ++- .../text_normalization/ru/verbalizers/verbalize_final.py | 
3 ++- nemo_text_processing/text_normalization/run_evaluate.py | 1 - .../text_normalization/sv/graph_utils.py | 3 ++- .../text_normalization/sv/taggers/abbreviation.py | 3 ++- .../text_normalization/sv/taggers/cardinal.py | 3 ++- .../text_normalization/sv/taggers/date.py | 3 ++- .../text_normalization/sv/taggers/decimal.py | 3 ++- .../text_normalization/sv/taggers/electronic.py | 3 ++- .../text_normalization/sv/taggers/fraction.py | 3 ++- .../text_normalization/sv/taggers/measure.py | 3 ++- .../text_normalization/sv/taggers/money.py | 3 ++- .../text_normalization/sv/taggers/ordinal.py | 3 ++- .../text_normalization/sv/taggers/telephone.py | 3 ++- .../text_normalization/sv/taggers/time.py | 3 ++- .../sv/taggers/tokenize_and_classify.py | 3 ++- .../sv/taggers/tokenize_and_classify_with_audio.py | 3 ++- .../text_normalization/sv/taggers/whitelist.py | 3 ++- .../text_normalization/sv/taggers/word.py | 3 ++- .../text_normalization/sv/verbalizers/cardinal.py | 3 ++- .../text_normalization/sv/verbalizers/date.py | 3 ++- .../text_normalization/sv/verbalizers/decimals.py | 3 ++- .../text_normalization/sv/verbalizers/electronic.py | 3 ++- .../text_normalization/sv/verbalizers/fraction.py | 3 ++- .../text_normalization/sv/verbalizers/measure.py | 3 ++- .../text_normalization/sv/verbalizers/money.py | 3 ++- .../text_normalization/sv/verbalizers/ordinal.py | 3 ++- .../text_normalization/sv/verbalizers/telephone.py | 3 ++- .../text_normalization/sv/verbalizers/time.py | 3 ++- .../text_normalization/sv/verbalizers/verbalize_final.py | 3 ++- .../text_normalization/utils_audio_based.py | 1 + .../text_normalization/zh/graph_utils.py | 3 ++- .../text_normalization/zh/taggers/cardinal.py | 3 ++- .../text_normalization/zh/taggers/date.py | 3 ++- .../text_normalization/zh/taggers/decimal.py | 3 ++- .../text_normalization/zh/taggers/fraction.py | 3 ++- .../text_normalization/zh/taggers/math_symbol.py | 3 ++- .../text_normalization/zh/taggers/measure.py | 3 ++- 
.../text_normalization/zh/taggers/money.py | 3 ++- .../text_normalization/zh/taggers/ordinal.py | 3 ++- .../text_normalization/zh/taggers/preprocessor.py | 3 ++- .../text_normalization/zh/taggers/punctuation.py | 5 +++-- .../text_normalization/zh/taggers/time.py | 3 ++- .../zh/taggers/tokenize_and_classify.py | 3 ++- .../text_normalization/zh/taggers/whitelist.py | 3 ++- .../text_normalization/zh/taggers/word.py | 3 ++- .../text_normalization/zh/verbalizers/cardinal.py | 3 ++- .../text_normalization/zh/verbalizers/date.py | 3 ++- .../text_normalization/zh/verbalizers/decimal.py | 3 ++- .../text_normalization/zh/verbalizers/fraction.py | 3 ++- .../text_normalization/zh/verbalizers/math_symbol.py | 3 ++- .../text_normalization/zh/verbalizers/measure.py | 3 ++- .../text_normalization/zh/verbalizers/money.py | 3 ++- .../text_normalization/zh/verbalizers/ordinal.py | 3 ++- .../text_normalization/zh/verbalizers/postprocessor.py | 3 ++- .../text_normalization/zh/verbalizers/time.py | 3 ++- .../text_normalization/zh/verbalizers/verbalize.py | 1 + .../text_normalization/zh/verbalizers/verbalize_final.py | 3 ++- .../text_normalization/zh/verbalizers/whitelist.py | 3 ++- .../text_normalization/zh/verbalizers/word.py | 3 ++- requirements/requirements_test.txt | 2 +- setup.cfg | 2 +- tests/conftest.py | 2 +- tests/nemo_text_processing/ar/test_cardinal.py | 3 ++- tests/nemo_text_processing/ar/test_decimal.py | 3 ++- tests/nemo_text_processing/ar/test_fraction.py | 3 ++- tests/nemo_text_processing/ar/test_money.py | 3 ++- tests/nemo_text_processing/ar/test_whitelist.py | 3 ++- .../audio_based_utils/test_audio_based_utils.py | 1 + tests/nemo_text_processing/de/test_cardinal.py | 3 ++- tests/nemo_text_processing/de/test_date.py | 3 ++- tests/nemo_text_processing/de/test_decimal.py | 3 ++- tests/nemo_text_processing/de/test_electronic.py | 3 ++- tests/nemo_text_processing/de/test_fraction.py | 3 ++- tests/nemo_text_processing/de/test_measure.py | 3 ++- 
tests/nemo_text_processing/de/test_money.py | 3 ++- .../de/test_normalization_with_audio.py | 3 ++- tests/nemo_text_processing/de/test_ordinal.py | 3 ++- tests/nemo_text_processing/de/test_telephone.py | 3 ++- tests/nemo_text_processing/de/test_time.py | 3 ++- tests/nemo_text_processing/de/test_whitelist.py | 3 ++- tests/nemo_text_processing/de/test_word.py | 3 ++- tests/nemo_text_processing/en/test_address.py | 3 ++- tests/nemo_text_processing/en/test_cardinal.py | 3 ++- tests/nemo_text_processing/en/test_date.py | 3 ++- tests/nemo_text_processing/en/test_decimal.py | 3 ++- tests/nemo_text_processing/en/test_electronic.py | 3 ++- tests/nemo_text_processing/en/test_fraction.py | 3 ++- tests/nemo_text_processing/en/test_math.py | 3 ++- tests/nemo_text_processing/en/test_measure.py | 3 ++- tests/nemo_text_processing/en/test_money.py | 3 ++- .../en/test_normalization_with_audio.py | 3 ++- tests/nemo_text_processing/en/test_ordinal.py | 3 ++- tests/nemo_text_processing/en/test_punctuation.py | 3 ++- tests/nemo_text_processing/en/test_range.py | 3 ++- tests/nemo_text_processing/en/test_roman.py | 3 ++- tests/nemo_text_processing/en/test_serial.py | 3 ++- tests/nemo_text_processing/en/test_special_text.py | 3 ++- tests/nemo_text_processing/en/test_telephone.py | 3 ++- tests/nemo_text_processing/en/test_text_split.py | 1 + tests/nemo_text_processing/en/test_time.py | 3 ++- tests/nemo_text_processing/en/test_whitelist.py | 3 ++- tests/nemo_text_processing/en/test_word.py | 3 ++- tests/nemo_text_processing/es/test_cardinal.py | 3 ++- tests/nemo_text_processing/es/test_date.py | 3 ++- tests/nemo_text_processing/es/test_decimal.py | 3 ++- tests/nemo_text_processing/es/test_electronic.py | 3 ++- tests/nemo_text_processing/es/test_fraction.py | 3 ++- tests/nemo_text_processing/es/test_measure.py | 3 ++- tests/nemo_text_processing/es/test_money.py | 3 ++- .../es/test_normalization_with_audio.py | 3 ++- tests/nemo_text_processing/es/test_ordinal.py | 3 ++- 
tests/nemo_text_processing/es/test_telephone.py | 3 ++- tests/nemo_text_processing/es/test_time.py | 3 ++- tests/nemo_text_processing/es/test_whitelist.py | 3 ++- tests/nemo_text_processing/es/test_word.py | 3 ++- tests/nemo_text_processing/es_en/test_cardinal.py | 3 ++- tests/nemo_text_processing/es_en/test_date.py | 3 ++- tests/nemo_text_processing/es_en/test_decimal.py | 3 ++- tests/nemo_text_processing/es_en/test_electronic.py | 3 ++- tests/nemo_text_processing/es_en/test_fraction.py | 3 ++- tests/nemo_text_processing/es_en/test_measure.py | 3 ++- tests/nemo_text_processing/es_en/test_money.py | 3 ++- tests/nemo_text_processing/es_en/test_ordinal.py | 3 ++- tests/nemo_text_processing/es_en/test_telephone.py | 3 ++- tests/nemo_text_processing/es_en/test_time.py | 3 ++- tests/nemo_text_processing/es_en/test_whitelist.py | 3 ++- tests/nemo_text_processing/es_en/test_word.py | 3 ++- tests/nemo_text_processing/fr/test_cardinal.py | 3 ++- tests/nemo_text_processing/fr/test_date.py | 3 ++- tests/nemo_text_processing/fr/test_decimal.py | 3 ++- tests/nemo_text_processing/fr/test_electronic.py | 3 ++- tests/nemo_text_processing/fr/test_fraction.py | 3 ++- tests/nemo_text_processing/fr/test_measure.py | 3 ++- tests/nemo_text_processing/fr/test_money.py | 3 ++- tests/nemo_text_processing/fr/test_ordinal.py | 3 ++- tests/nemo_text_processing/fr/test_telephone.py | 3 ++- tests/nemo_text_processing/fr/test_time.py | 3 ++- tests/nemo_text_processing/fr/test_whitelist.py | 3 ++- tests/nemo_text_processing/fr/test_word.py | 3 ++- tests/nemo_text_processing/hu/test_cardinal.py | 3 ++- tests/nemo_text_processing/hu/test_date.py | 3 ++- tests/nemo_text_processing/hu/test_decimal.py | 3 ++- tests/nemo_text_processing/hu/test_electronic.py | 3 ++- tests/nemo_text_processing/hu/test_fraction.py | 3 ++- tests/nemo_text_processing/hu/test_measure.py | 3 ++- tests/nemo_text_processing/hu/test_money.py | 3 ++- tests/nemo_text_processing/hu/test_ordinal.py | 3 ++- 
tests/nemo_text_processing/hu/test_telephone.py | 3 ++- tests/nemo_text_processing/hu/test_time.py | 3 ++- tests/nemo_text_processing/hu/test_whitelist.py | 3 ++- tests/nemo_text_processing/hu/test_word.py | 3 ++- tests/nemo_text_processing/it/test_cardinal.py | 3 ++- tests/nemo_text_processing/it/test_decimal.py | 3 ++- tests/nemo_text_processing/it/test_electronic.py | 3 ++- tests/nemo_text_processing/it/test_measure.py | 3 ++- tests/nemo_text_processing/it/test_money.py | 3 ++- tests/nemo_text_processing/it/test_time.py | 3 ++- tests/nemo_text_processing/it/test_whitelist.py | 3 ++- tests/nemo_text_processing/pt/test_cardinal.py | 3 ++- tests/nemo_text_processing/pt/test_date.py | 3 ++- tests/nemo_text_processing/pt/test_decimal.py | 3 ++- tests/nemo_text_processing/pt/test_electronic.py | 3 ++- tests/nemo_text_processing/pt/test_measure.py | 3 ++- tests/nemo_text_processing/pt/test_money.py | 3 ++- tests/nemo_text_processing/pt/test_ordinal.py | 3 ++- tests/nemo_text_processing/pt/test_telephone.py | 3 ++- tests/nemo_text_processing/pt/test_time.py | 3 ++- tests/nemo_text_processing/pt/test_whitelist.py | 3 ++- tests/nemo_text_processing/pt/test_word.py | 3 ++- tests/nemo_text_processing/sv/test_cardinal.py | 3 ++- tests/nemo_text_processing/sv/test_date.py | 3 ++- tests/nemo_text_processing/sv/test_decimal.py | 3 ++- tests/nemo_text_processing/sv/test_electronic.py | 3 ++- tests/nemo_text_processing/sv/test_fraction.py | 3 ++- tests/nemo_text_processing/sv/test_measure.py | 3 ++- tests/nemo_text_processing/sv/test_money.py | 3 ++- .../sv/test_normalization_with_audio.py | 3 ++- tests/nemo_text_processing/sv/test_ordinal.py | 3 ++- tests/nemo_text_processing/sv/test_telephone.py | 3 ++- tests/nemo_text_processing/sv/test_time.py | 3 ++- tests/nemo_text_processing/sv/test_whitelist.py | 3 ++- tests/nemo_text_processing/sv/test_word.py | 3 ++- tests/nemo_text_processing/zh/test_cardinal.py | 3 ++- tests/nemo_text_processing/zh/test_date.py | 3 ++- 
tests/nemo_text_processing/zh/test_decimal.py | 3 ++- tests/nemo_text_processing/zh/test_fraction.py | 3 ++- tests/nemo_text_processing/zh/test_math.py | 3 ++- tests/nemo_text_processing/zh/test_measure.py | 3 ++- tests/nemo_text_processing/zh/test_money.py | 3 ++- tests/nemo_text_processing/zh/test_ordinal.py | 3 ++- tests/nemo_text_processing/zh/test_preprocess.py | 3 ++- tests/nemo_text_processing/zh/test_time.py | 3 ++- tests/nemo_text_processing/zh/test_whitelist.py | 3 ++- tests/nemo_text_processing/zh/test_word.py | 3 ++- tools/text_processing_deployment/pynini_export.py | 1 + 615 files changed, 1227 insertions(+), 626 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bddd9abc3..b4317f989 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,7 +30,7 @@ repos: - id: requirements-txt-fixer - repo: https://github.com/PyCQA/isort - rev: 4.3.21 + rev: 5.12.0 hooks: - id: isort name: Format imports diff --git a/nemo_text_processing/fst_alignment/alignment.py b/nemo_text_processing/fst_alignment/alignment.py index 007d6d9c9..f900923fd 100644 --- a/nemo_text_processing/fst_alignment/alignment.py +++ b/nemo_text_processing/fst_alignment/alignment.py @@ -21,7 +21,6 @@ import pynini from pynini import Far - """ This files takes 1. Far file containing a fst graph created by TN or ITN 2. entire string. Optionally: 3. start position of substring 4. 
end (exclusive) position of substring diff --git a/nemo_text_processing/hybrid/model_utils.py b/nemo_text_processing/hybrid/model_utils.py index f40b56ba0..9dc2314a7 100644 --- a/nemo_text_processing/hybrid/model_utils.py +++ b/nemo_text_processing/hybrid/model_utils.py @@ -17,9 +17,10 @@ import re from typing import List, Union -from nemo_text_processing.hybrid.mlm_scorer import MLMScorer from tqdm import tqdm +from nemo_text_processing.hybrid.mlm_scorer import MLMScorer + try: import torch except ImportError as e: diff --git a/nemo_text_processing/hybrid/utils.py b/nemo_text_processing/hybrid/utils.py index 699a5b647..ced823510 100644 --- a/nemo_text_processing/hybrid/utils.py +++ b/nemo_text_processing/hybrid/utils.py @@ -23,11 +23,12 @@ import pandas as pd import pynini -from nemo_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from pynini.lib.rewrite import top_rewrite from tqdm import tqdm +from nemo_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + DELIMITER = '~~' cardinal_graph = CardinalFst(input_case="cased").graph_no_exception diff --git a/nemo_text_processing/hybrid/wfst_lm_rescoring.py b/nemo_text_processing/hybrid/wfst_lm_rescoring.py index e3c0bd02d..86f375058 100644 --- a/nemo_text_processing/hybrid/wfst_lm_rescoring.py +++ b/nemo_text_processing/hybrid/wfst_lm_rescoring.py @@ -25,9 +25,10 @@ import pandas as pd import utils from joblib import Parallel, delayed -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from tqdm import tqdm +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio + parser = argparse.ArgumentParser(description="Re-scoring") parser.add_argument("--lang", default="en", type=str, 
choices=["en"]) parser.add_argument("--n_tagged", default=100, type=int, help="Number WFST options") diff --git a/nemo_text_processing/inverse_text_normalization/ar/__init__.py b/nemo_text_processing/inverse_text_normalization/ar/__init__.py index 55b8c44a9..e56a8f9af 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ar/__init__.py @@ -13,8 +13,9 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/__init__.py index 55b8c44a9..e56a8f9af 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/__init__.py @@ -13,8 +13,9 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py index a9a6869bf..40ee1acf0 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_SPACE, GraphFst + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py index fa14c4629..f0d641d14 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py @@ -13,13 +13,14 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.ar.graph_utils import ( NEMO_SPACE, GraphFst, delete_extra_space, insert_space, ) -from pynini.lib import pynutil class DecimalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/fraction.py index be18ac16f..beefe52ee 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/fraction.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.ar.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, @@ -21,7 +23,6 @@ insert_space, ) from nemo_text_processing.text_normalization.ar.utils import get_abs_path -from pynini.lib import pynutil class FractionFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/measure.py index ac3fc7fbe..2f047240e 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/measure.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.ar.graph_utils import GraphFst, convert_space, delete_extra_space from nemo_text_processing.text_normalization.ar.taggers.measure import unit_singular -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/money.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/money.py index e13b5a45a..50eaa5d6f 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/money.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.ar.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, @@ -23,7 +25,6 @@ insert_space, ) from nemo_text_processing.text_normalization.ar.taggers.money import ar_cur, maj_singular, min_plural, min_singular -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/punctuation.py index 798546ccf..5fe5a8411 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/punctuation.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.ar.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.ar.graph_utils import GraphFst + class PunctuationFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/tokenize_and_classify.py index 9cc5f8eed..97818c7bb 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.ar.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ar.taggers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.ar.taggers.fraction import FractionFst @@ -31,7 +33,6 @@ from nemo_text_processing.text_normalization.ar.taggers.tokenize_and_classify import ClassifyFst as TNClassifyFst from nemo_text_processing.text_normalization.en.graph_utils import INPUT_LOWER_CASED from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/word.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/word.py index bf9095f6b..83ec8f76a 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/word.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/__init__.py index 67e5340b8..8944d9e6c 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. import pynini + from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/cardinal.py index 5ced7907d..2145bd935 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/cardinal.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/decimal.py index 0b5f5ed2c..842f6e3f5 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class DecimalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/fraction.py index a36533bee..3a7f885d5 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/fraction.py @@ -14,6 +14,8 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.ar.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SPACE, @@ -21,7 +23,6 @@ delete_space, delete_zero_or_one_space, ) -from pynini.lib import pynutil class FractionFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/measure.py index 78299091b..f2eeee5ce 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/measure.py 
@@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_CHAR, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_CHAR, GraphFst, delete_space + class MeasureFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/money.py index 084e740d4..51d5de412 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/money.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_CHAR, NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_CHAR, GraphFst, delete_space + class MoneyFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/verbalize_final.py index 17aa67826..326d49df8 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/verbalize_final.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.ar.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.ar.verbalizers.word import WordFst from nemo_text_processing.text_normalization.ar.graph_utils import GraphFst, delete_extra_space, delete_space -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/word.py index 11da24fab..434ffcc6a 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/word.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/de/taggers/cardinal.py index e0b066173..0670090b8 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/date.py b/nemo_text_processing/inverse_text_normalization/de/taggers/date.py index b65be41e4..916b9413d 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/date.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_NOT_QUOTE, @@ -20,7 +22,6 @@ GraphFst, convert_space, ) -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/de/taggers/decimal.py index e6d562d4b..53354e3ed 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.taggers.decimal import get_quantity, quantities from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst -from pynini.lib import pynutil class DecimalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/de/taggers/electronic.py index 849d6c843..38ca80ca5 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + class ElectronicFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/de/taggers/fraction.py index 6960a6663..14e06a5be 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/fraction.py @@ -13,13 +13,14 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, convert_space, delete_space, ) -from pynini.lib import pynutil class FractionFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/de/taggers/measure.py index 86eff3ff2..dbfff31b4 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/measure.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.taggers.measure import singular_to_plural, unit_singular from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_SIGMA, @@ -21,7 +23,6 @@ delete_extra_space, delete_space, ) -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/money.py b/nemo_text_processing/inverse_text_normalization/de/taggers/money.py index dad06de1c..c53b865ed 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/money.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.taggers.money import maj_singular, min_plural, min_singular from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, @@ -23,7 +25,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/de/taggers/ordinal.py index 300dd9010..52b8ffa0b 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/ordinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/de/taggers/telephone.py index ddc3391d4..22474376f 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space, insert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space, insert_space + class TelephoneFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/time.py b/nemo_text_processing/inverse_text_normalization/de/taggers/time.py index c747202b9..571edd724 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/time.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst + class TimeFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/de/taggers/tokenize_and_classify.py index 4b965f86d..1d60d071a 100644 --- a/nemo_text_processing/inverse_text_normalization/de/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.de.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.de.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.de.taggers.decimal import DecimalFst @@ -46,7 +48,6 @@ generator_main, ) from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/de/taggers/whitelist.py index 18c6e0502..c99d79c30 100644 --- 
a/nemo_text_processing/inverse_text_normalization/de/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/de/taggers/whitelist.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space + class WhiteListFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/de/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/de/verbalizers/cardinal.py index 92a83625a..b13382a8e 100644 --- a/nemo_text_processing/inverse_text_normalization/de/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/de/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/de/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/de/verbalizers/decimal.py index ff3839533..8c4ac951e 100644 --- a/nemo_text_processing/inverse_text_normalization/de/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/de/verbalizers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_preserve_order from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_preserve_order + class DecimalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/de/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/de/verbalizers/measure.py index 90f43e7e6..14afbbee5 100644 --- a/nemo_text_processing/inverse_text_normalization/de/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/de/verbalizers/measure.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space + class MeasureFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/de/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/de/verbalizers/money.py index f0377d4fa..f8bbc0793 100644 --- a/nemo_text_processing/inverse_text_normalization/de/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/de/verbalizers/money.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space + class MoneyFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/de/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/de/verbalizers/time.py index 13a013db5..3031ac2b4 100644 --- a/nemo_text_processing/inverse_text_normalization/de/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/de/verbalizers/time.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, delete_space + class TimeFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/de/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/de/verbalizers/verbalize_final.py index 779a00a42..ab2576934 100644 --- a/nemo_text_processing/inverse_text_normalization/de/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/de/verbalizers/verbalize_final.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.de.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/en/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/en/clean_eval_data.py index d9bc2fccb..ab2969f98 100644 --- a/nemo_text_processing/inverse_text_normalization/en/clean_eval_data.py +++ b/nemo_text_processing/inverse_text_normalization/en/clean_eval_data.py @@ -16,6 +16,7 @@ from typing import List import regex as re + from nemo_text_processing.text_normalization.data_loader_utils import ( EOS_TYPE, Instance, @@ -23,7 +24,6 @@ training_data_to_sentences, ) - """ This file is for evaluation purposes. filter_loaded_data() cleans data (list of instances) for inverse text normalization. Filters and cleaners can be specified for each semiotic class individually. diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py index 8eb98bd5d..36f424208 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py @@ -14,6 +14,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path, num_to_word from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -27,7 +29,6 @@ capitalized_input_graph, delete_space, ) -from pynini.lib import pynutil class CardinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py index fadde9cbd..8d8a4f444 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py @@ -14,6 +14,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -25,7 +27,6 @@ delete_extra_space, delete_space, ) -from pynini.lib import pynutil graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")).optimize() graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py index f6a730ad6..2c6ee7a62 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py @@ -14,6 +14,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -29,7 +31,6 @@ delete_space, ) from nemo_text_processing.text_normalization.en.utils import load_labels -from pynini.lib import pynutil def get_quantity( diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py index 2a2634cf4..a2373d9d7 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py @@ -14,6 +14,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path, get_various_formats from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -25,7 +27,6 @@ insert_space, ) from nemo_text_processing.text_normalization.en.utils import load_labels -from pynini.lib import pynutil class ElectronicFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py index dcb37e5fc..2d9d5e02c 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py @@ -14,6 +14,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -27,7 +29,6 @@ delete_space, get_singulars, ) -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/money.py b/nemo_text_processing/inverse_text_normalization/en/taggers/money.py index 5b08491c7..2a1e32a49 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/money.py @@ -14,6 +14,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -29,7 +31,6 @@ get_singulars, insert_space, ) -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/ordinal.py index a60f704f2..632e03685 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/ordinal.py @@ -14,6 +14,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -22,7 +24,6 @@ GraphFst, capitalized_input_graph, ) -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/en/taggers/punctuation.py index 10de67cdb..f53037da0 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/punctuation.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + class PunctuationFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py index 8d0f67028..dba4c0201 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py @@ -14,6 +14,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -28,7 +30,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil def get_serial_number(cardinal): diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/time.py b/nemo_text_processing/inverse_text_normalization/en/taggers/time.py index 995745331..53d3dd931 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/time.py @@ -15,6 +15,8 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path, num_to_word from nemo_text_processing.text_normalization.en.graph_utils import ( @@ -27,7 +29,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/en/taggers/tokenize_and_classify.py index 8d2908edb..365257b6c 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/tokenize_and_classify.py @@ -16,6 +16,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.en.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.en.taggers.decimal import DecimalFst @@ -36,7 +38,6 @@ generator_main, ) from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git 
a/nemo_text_processing/inverse_text_normalization/en/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/en/taggers/whitelist.py index 26ef5501b..f0f20cba7 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/whitelist.py @@ -17,6 +17,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -26,7 +28,6 @@ string_map_cased, ) from nemo_text_processing.text_normalization.en.utils import load_labels -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/word.py b/nemo_text_processing/inverse_text_normalization/en/taggers/word.py index 714e68efe..2c5fdc8c2 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/word.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/cardinal.py index 8d9c3a621..d9ffd5e99 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/date.py index 75a6e0d09..116bd445b 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/date.py @@ -13,13 +13,14 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, delete_extra_space, delete_space, ) -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/decimal.py index 41b3a281a..60cdc138b 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class DecimalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/electronic.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/electronic.py index 4c9951c95..d390868ac 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class ElectronicFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/measure.py index c5a761761..3ec608881 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/measure.py @@ -14,9 +14,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space + class MeasureFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/money.py index 7218257f7..a4f7c7594 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/money.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space + class MoneyFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/ordinal.py index 378939812..2f0a9453b 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/ordinal.py @@ -14,9 +14,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/telephone.py index 6c37ba468..e8d622e3c 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/telephone.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class TelephoneFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/time.py index 5a2c87093..061334796 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/time.py @@ -14,6 +14,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, @@ -21,7 +23,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize_final.py index a8ea18700..467329001 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/verbalize_final.py @@ -14,10 +14,11 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/whitelist.py index 67ffe4c76..7e1148909 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/whitelist.py @@ -15,9 +15,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + class WhiteListFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/word.py index 6e94ac8c8..70614fd49 100644 --- 
a/nemo_text_processing/inverse_text_normalization/en/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/word.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/graph_utils.py b/nemo_text_processing/inverse_text_normalization/es/graph_utils.py index 164ebcdbb..0396b5b4c 100644 --- a/nemo_text_processing/inverse_text_normalization/es/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/es/graph_utils.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.es.utils import get_abs_path from pynini.lib import pynutil +from nemo_text_processing.text_normalization.es.utils import get_abs_path + def int_to_roman(fst: 'pynini.FstLike') -> 'pynini.FstLike': """ diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py index 07a1e8316..085b6bff1 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_SPACE, GraphFst, delete_space -from pynini.lib import pynutil class CardinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py index 3100e6a50..c4320825c 100644 --- 
a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py @@ -13,10 +13,11 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.graph_utils import int_to_roman from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py index bdbf18049..b98c5b1e2 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, @@ -20,7 +22,6 @@ delete_extra_space, delete_space, ) -from pynini.lib import pynutil def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_million: 'pynini.FstLike') -> 'pynini.FstLike': diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py index 53b6b4d09..98d8f60de 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, GraphFst, insert_space -from pynini.lib import pynutil class ElectronicFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py index f31d984eb..4fcf63706 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, NEMO_SPACE, GraphFst -from pynini.lib import pynutil class FractionFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py index 6aea36ede..68770a05c 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, @@ -22,7 +24,6 @@ delete_extra_space, delete_space, ) -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/money.py b/nemo_text_processing/inverse_text_normalization/es/taggers/money.py index 357cc8e08..3caea6f41 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/money.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, @@ -23,7 +25,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py index 207bbbaf6..9b4ffaac8 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst, delete_space -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/es/taggers/punctuation.py index 12405d5c5..c34f732ed 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/punctuation.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + class PunctuationFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py index 0f6b5f003..5043443c4 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_space -from pynini.lib import pynutil class TelephoneFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py index 6e57b5a77..a45432085 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py @@ -14,6 +14,8 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( GraphFst, @@ -22,7 +24,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py index 315c14c9d..bed7ad019 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.es.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.es.taggers.decimal import DecimalFst @@ -36,7 +38,6 @@ generator_main, ) from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py index 
2d31cede5..7b9159724 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/word.py b/nemo_text_processing/inverse_text_normalization/es/taggers/word.py index 57f143d0d..b09f941e2 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/word.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/cardinal.py index e6737be6e..46fca2a1c 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py index 234fdc296..70cb9349f 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, @@ -20,7 +22,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/decimal.py index b443733a2..c9b060ec1 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class DecimalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/electronic.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/electronic.py index cf6bdc779..069716123 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class ElectronicFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/fraction.py index eae72ab4b..e553d95e9 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/fraction.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space + class FractionFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/measure.py index 0bd8f7460..6162f0c20 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/measure.py +++ 
b/nemo_text_processing/inverse_text_normalization/es/verbalizers/measure.py @@ -13,13 +13,14 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, GraphFst, delete_extra_space, delete_space, ) -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/money.py index a52371f1e..60e9b7aeb 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/money.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space + class MoneyFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py index 8cfb15095..c1c9bdb46 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py index bc32f62fe..58aa190ba 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class TelephoneFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py index 15b7ddbf1..e2f185f56 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, @@ -20,7 +22,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py index 26d09996b..6b22d6f73 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py @@ -13,10 +13,11 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.es.verbalizers.word import WordFst from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py index cc231a46b..606a4e569 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + class WhiteListFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py index 3a5ba96b9..8c0bd08b1 100644 --- 
a/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es_en/graph_utils.py b/nemo_text_processing/inverse_text_normalization/es_en/graph_utils.py index 164ebcdbb..0396b5b4c 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/graph_utils.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.es.utils import get_abs_path from pynini.lib import pynutil +from nemo_text_processing.text_normalization.es.utils import get_abs_path + def int_to_roman(fst: 'pynini.FstLike') -> 'pynini.FstLike': """ diff --git a/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py index abf54edb6..ebf2a2a2e 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst as EnCardinalFst from nemo_text_processing.inverse_text_normalization.en.taggers.date import DateFst as EnDateFst from nemo_text_processing.inverse_text_normalization.en.taggers.decimal import DecimalFst as EnDecimalFst @@ -49,7 +51,6 @@ generator_main, ) from nemo_text_processing.utils.logging 
import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py index 8e540a616..fd0955994 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.verbalizers.cardinal import CardinalFst as EnCardinalFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.date import DateFst as EnDateFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.decimal import DecimalFst as EnDecimalFst @@ -34,7 +36,6 @@ from nemo_text_processing.inverse_text_normalization.es.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.es.verbalizers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.en.graph_utils import GraphFst -from pynini.lib import pynutil class VerbalizeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py index 65d9b91c4..3323f173b 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.es_en.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/graph_utils.py b/nemo_text_processing/inverse_text_normalization/fr/graph_utils.py index 1e2135e40..36eccd14b 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/fr/graph_utils.py @@ -19,13 +19,14 @@ from typing import Dict import pynini -from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path -from nemo_text_processing.utils.logging import logger from pynini import Far from pynini.examples import plurals from pynini.export import export from pynini.lib import byte, pynutil, utf8 +from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path +from nemo_text_processing.utils.logging import logger + NEMO_CHAR = utf8.VALID_UTF8_CHAR NEMO_DIGIT = byte.DIGIT diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py index 2c8fd6c0c..333460eb0 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, @@ -23,7 +25,6 @@ delete_hyphen, ) from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil def rewrite(cardinal: 'pynini.FstLike') -> 'pynini.FstLike': diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py index 1412111b4..06807f6a3 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst, delete_extra_space from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/decimal.py index 035d70fab..7994b719d 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/decimal.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( NEMO_DIGIT, GraphFst, @@ -21,7 +23,6 @@ delete_space, ) from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_thousand: 'pynini.FstLike') -> 'pynini.FstLike': diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/electronic.py index 15d826650..36e54fad4 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_ALPHA, GraphFst, insert_space from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class ElectronicFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/fraction.py index e05323724..ca089455a 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/fraction.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( NEMO_CHAR, GraphFst, @@ -20,7 +22,6 @@ delete_space, ) from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class FractionFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/measure.py index d9636286c..0bd3298c3 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/measure.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( GraphFst, delete_extra_space, @@ -20,7 +22,6 @@ get_singulars, ) from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/money.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/money.py index 17f473aa2..94c0dd2eb 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/money.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( NEMO_DIGIT, GraphFst, @@ -20,7 +22,6 @@ delete_space, ) from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/ordinal.py index 29da19b85..03976e9e9 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/ordinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_SIGMA, GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/punctuation.py index 458497aa6..e6c833db3 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/punctuation.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst + class PunctuationFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/telephone.py index b81cc0dfd..b157960c0 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/telephone.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( GraphFst, delete_hyphen, @@ -20,7 +22,6 @@ insert_space, ) from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class TelephoneFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/time.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/time.py index 499701a1e..5113d50e8 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/time.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/tokenize_and_classify.py index 47b6feb5e..88774456e 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( GraphFst, delete_extra_space, @@ -36,7 +38,6 @@ from nemo_text_processing.inverse_text_normalization.fr.taggers.word import WordFst from nemo_text_processing.text_normalization.en.graph_utils import INPUT_LOWER_CASED from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/whitelist.py index 363e64bfa..722dc84b4 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/whitelist.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst, convert_space from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/word.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/word.py index 95d05ce6a..21b852910 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/word.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/cardinal.py index 4aca2bac2..bc4e76fa2 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/date.py index 47304c1e1..62304b0d2 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/date.py @@ -13,13 +13,14 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, delete_extra_space, delete_space, ) -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py index dc00842cd..c1a55401e 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( NEMO_DIGIT, NEMO_NON_BREAKING_SPACE, @@ -20,7 +22,6 @@ GraphFst, delete_space, ) -from pynini.lib import pynutil class NumberParser(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/electronic.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/electronic.py index e4cc75cbd..72113be63 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class ElectronicFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/fraction.py index dce25d949..f7d163f23 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/fraction.py @@ -13,13 +13,14 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space, ) -from pynini.lib import pynutil class FractionFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/measure.py index ad0f2f98c..8b49c0eb1 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/measure.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_CHAR, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_CHAR, GraphFst, delete_space + class MeasureFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/money.py index db4725684..15dc72c94 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/money.py @@ -13,13 +13,14 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, delete_extra_space, delete_space, ) -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py index 96d8c2403..77dd6323f 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( NEMO_DIGIT, NEMO_NOT_QUOTE, @@ -20,7 +22,6 @@ delete_space, ) from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/telephone.py index c204bcc38..d937c04d7 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_NOT_QUOTE, GraphFst + class TelephoneFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/time.py index 93e4886e5..52af95d09 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/time.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( NEMO_DIGIT, GraphFst, @@ -20,7 +22,6 @@ delete_space, ) from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/verbalize_final.py index 9753b7855..c0bf305da 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/verbalize_final.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst, delete_extra_space, delete_space from nemo_text_processing.inverse_text_normalization.fr.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.fr.verbalizers.word import WordFst -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/whitelist.py index 6b350062d..00327a416 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/whitelist.py @@ -14,13 +14,14 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space, ) -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/word.py index fa8f0f39d..6c510af08 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/word.py @@ -13,13 +13,14 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.fr.graph_utils import ( NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space, ) -from pynini.lib import pynutil class WordFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py index 5ff16a6e0..8eeea3876 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py @@ -14,6 +14,8 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, @@ -24,7 +26,6 @@ GraphFst, delete_space, ) -from pynini.lib import pynutil class CardinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/date.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/date.py index 4b891ea98..2cd22e5ca 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/date.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/decimal.py index dab779965..24981ed7e 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/decimal.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, @@ -20,7 +22,6 @@ delete_extra_space, delete_space, ) -from pynini.lib import pynutil def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_million: 'pynini.FstLike') -> 'pynini.FstLike': diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/electronic.py index aa152b116..4f3fe1fc8 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, GraphFst, insert_space -from pynini.lib import pynutil class ElectronicFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/measure.py index 7b6f1015a..13b9ffef3 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/measure.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_SIGMA, @@ -21,7 +23,6 @@ delete_extra_space, delete_space, ) -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/money.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/money.py index cc3639438..5970da8de 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/money.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, @@ -23,7 +25,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/ordinal.py index ff7f3fbf0..dae122abc 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/ordinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst, delete_space -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/punctuation.py index cb5285452..3c23bad1a 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/punctuation.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + class PunctuationFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/telephone.py index a1ad2d075..d048cc36c 100755 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_space, insert_space -from pynini.lib import pynutil class TelephoneFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/time.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/time.py index 1e209ec96..df2a6e69e 100755 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/time.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_space, insert_space -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/tokenize_and_classify.py index a30ccc850..94282af34 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.pt.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.pt.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.pt.taggers.decimal import DecimalFst @@ -35,7 +37,6 @@ generator_main, ) from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/whitelist.py 
b/nemo_text_processing/inverse_text_normalization/pt/taggers/whitelist.py index 5f6491893..3377f39a9 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/whitelist.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/word.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/word.py index 7908397d5..1544fb351 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/word.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/cardinal.py index 928a259d3..56c82501f 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/date.py index 7c0034099..8faf99407 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/date.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, @@ -20,7 +22,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/decimal.py index 58fc76ea6..60a61e0ee 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class DecimalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/electronic.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/electronic.py index 11b2706a3..203a44682 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class ElectronicFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/measure.py index 057ade696..77d62d93f 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/measure.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space + class MeasureFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/money.py index 54a9b1038..755361bda 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/money.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space, insert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space, insert_space + class MoneyFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/ordinal.py index fe3454e15..25d8595df 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/ordinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/telephone.py index 4dd0d7079..1423f33e8 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class TelephoneFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/time.py index b1a04c673..4bdfe0fca 100755 --- a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/time.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_NOT_QUOTE, @@ -20,7 +22,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/verbalize_final.py index cc2e65aed..5e7248bac 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/verbalize_final.py @@ -13,10 +13,11 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.pt.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.pt.verbalizers.word import WordFst from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/whitelist.py index f54aaea65..75c80c383 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/whitelist.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + class WhiteListFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/word.py index 4417d8f00..16c38ee05 100644 --- 
a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/word.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/cardinal.py index e812be117..20bab26f2 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ru/taggers/cardinal.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, insert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, insert_space + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/date.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/date.py index 9727626c8..8e262891b 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/ru/taggers/date.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + class DateFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/decimals.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/decimals.py index 66e8f5f73..aa6c784b3 100644 --- 
a/nemo_text_processing/inverse_text_normalization/ru/taggers/decimals.py +++ b/nemo_text_processing/inverse_text_normalization/ru/taggers/decimals.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SPACE, GraphFst, delete_extra_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SPACE, GraphFst, delete_extra_space + class DecimalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/electronic.py index f73c8f1b8..62712c054 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/ru/taggers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + class ElectronicFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/measure.py index ef15a61e2..d083d899b 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/ru/taggers/measure.py @@ -13,9 +13,10 @@ # limitations under the License. 
-from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + class MeasureFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/money.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/money.py index 7477350ec..0185fed32 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ru/taggers/money.py @@ -13,9 +13,10 @@ # limitations under the License. -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + class MoneyFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/ordinal.py index 306054d63..d0d62f193 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ru/taggers/ordinal.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/telephone.py index 7b54ddc7e..61bbbad20 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/ru/taggers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. 
-from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + class TelephoneFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/time.py index 50e12c09a..01e12453c 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ru/taggers/time.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SPACE, GraphFst from nemo_text_processing.text_normalization.ru.verbalizers.time import TimeFst as TNTimeVerbalizer -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/tokenize_and_classify.py index 62605ec53..75469203d 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ru/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.taggers.punctuation import PunctuationFst from nemo_text_processing.inverse_text_normalization.en.taggers.word import WordFst from nemo_text_processing.inverse_text_normalization.ru.taggers.cardinal import CardinalFst @@ -36,7 +38,6 @@ ) from nemo_text_processing.text_normalization.ru.taggers.tokenize_and_classify import ClassifyFst as TNClassifyFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/whitelist.py 
b/nemo_text_processing/inverse_text_normalization/ru/taggers/whitelist.py index 45edf1592..2c44cdac7 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/ru/taggers/whitelist.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space from nemo_text_processing.text_normalization.ru.utils import get_abs_path -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/cardinal.py index 12e2451f7..fa76f2fbc 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/date.py index 02cce48da..8ccf4c887 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/date.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class DateFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/decimal.py index cf8cefc5d..62da0ddb5 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space + class DecimalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/electronic.py b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/electronic.py index 158cd9893..0b560b82e 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class ElectronicFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/measure.py index 3cae67bac..5ba92e6b0 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/measure.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class MeasureFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/money.py index af59d33db..9c8a44b00 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/money.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class MoneyFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/ordinal.py index 31ff9b3c6..3627ede9a 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/ordinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/telephone.py index 06c705dc4..47e1c2754 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class TelephoneFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/time.py index edfe7b898..addd809eb 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/time.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class TimeFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/verbalize_final.py index f409537d2..bfa68b8a6 100644 --- a/nemo_text_processing/inverse_text_normalization/ru/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ru/verbalizers/verbalize_final.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.ru.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index d6ed9fb9b..4cc178ce1 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -23,7 +23,6 @@ training_data_to_tokens, ) - ''' Runs Evaluation on data in the format of : \t\t<`self` if trivial class or normalized text> like the Google text normalization data https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/cardinal.py index 07c1cd9b6..d08b39589 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py index eb10d0753..5bb6c63bc 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_SPACE, GraphFst + class DateFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py index da3d7e493..e39a9017a 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst from nemo_text_processing.text_normalization.sv.taggers.decimal import get_quantity -from pynini.lib import pynutil class DecimalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/electronic.py index 526d45070..c1c2bc2a3 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + class ElectronicFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/fraction.py index 8b3d60c04..2ba361280 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/fraction.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SPACE, GraphFst, convert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SPACE, GraphFst, convert_space + class FractionFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/ordinal.py index 3f97c7ef5..bc91a7427 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/ordinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/telephone.py index 7436407b3..7c319e0f3 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SPACE, GraphFst, convert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SPACE, GraphFst, convert_space + class TelephoneFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/time.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/time.py index 77d59aacb..cf8fdc202 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/time.py @@ -15,11 +15,12 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.sv.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SPACE, GraphFst from nemo_text_processing.text_normalization.sv.utils import get_abs_path as get_tn_abs_path from nemo_text_processing.text_normalization.sv.utils import load_labels -from pynini.lib import pynutil QUARTERS = {15: "kvart över", 30: "halv", 45: "kvart i"} diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/tokenize_and_classify.py index 7fd0eb3a3..f8eef59d3 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.taggers.punctuation import PunctuationFst from nemo_text_processing.inverse_text_normalization.en.taggers.word import WordFst from nemo_text_processing.inverse_text_normalization.sv.taggers.cardinal import CardinalFst @@ -42,7 +44,6 @@ from nemo_text_processing.text_normalization.sv.taggers.telephone import TelephoneFst as TNTelephoneTagger from 
nemo_text_processing.text_normalization.sv.verbalizers.electronic import ElectronicFst as TNElectronicVerbalizer from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/whitelist.py index df88a65a8..97a2fcf3b 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/whitelist.py @@ -16,6 +16,8 @@ import os +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.sv.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_LOWER_CASED, @@ -23,7 +25,6 @@ convert_space, string_map_cased, ) -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/sv/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/sv/verbalizers/cardinal.py index 92a83625a..b13382a8e 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/sv/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/sv/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/sv/verbalizers/date.py index 55ee362e3..e5c3f99e2 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/sv/verbalizers/date.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SPACE, @@ -20,7 +22,6 @@ delete_preserve_order, delete_space, ) -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/sv/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/sv/verbalizers/decimal.py index 48b22dbfa..d3ab30c74 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/sv/verbalizers/decimal.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, @@ -20,7 +22,6 @@ delete_space, ) from nemo_text_processing.text_normalization.sv.graph_utils import ensure_space -from pynini.lib import pynutil class DecimalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/sv/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/sv/verbalizers/time.py index f8489ba7b..81aba6d98 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/sv/verbalizers/time.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_NOT_QUOTE, @@ -20,7 +22,6 @@ GraphFst, delete_space, ) -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/sv/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/sv/verbalizers/verbalize_final.py index 61b6433e3..272f047e1 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/sv/verbalizers/verbalize_final.py @@ -13,10 +13,11 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.sv.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/vi/graph_utils.py b/nemo_text_processing/inverse_text_normalization/vi/graph_utils.py index e528e4cbe..a04fd73ca 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/vi/graph_utils.py @@ -19,11 +19,12 @@ from typing import Dict import pynini -from nemo_text_processing.utils.logging import logger from pynini import Far from pynini.export import export from pynini.lib import byte, pynutil, utf8 +from nemo_text_processing.utils.logging import logger + NEMO_CHAR = utf8.VALID_UTF8_CHAR NEMO_DIGIT = byte.DIGIT diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py index d25c96071..016df4f1d 100644 --- 
a/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py @@ -15,6 +15,8 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import ( NEMO_DIGIT, NEMO_SPACE, @@ -22,7 +24,6 @@ delete_space, ) from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path -from pynini.lib import pynutil class CardinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py index e1986f69c..b0cd8561a 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst, delete_extra_space, delete_space from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path -from pynini.lib import pynutil graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")).optimize() graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py index 298319306..033f3d86e 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py @@ -14,6 +14,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import ( NEMO_DIGIT, GraphFst, @@ -21,7 +23,6 @@ delete_space, ) from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path -from pynini.lib import pynutil graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/electronic.py index 3eb0c8886..e7f5b3697 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/electronic.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_ALPHA, GraphFst, insert_space from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path -from pynini.lib import pynutil class ElectronicFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/fraction.py index 7798be654..9aacf93bd 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/fraction.py @@ -14,9 +14,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst, delete_extra_space, delete_space from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst, delete_extra_space, delete_space + class FractionFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/measure.py index 0b1f08b87..6ffa64b04 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/measure.py @@ -14,6 +14,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import ( GraphFst, convert_space, @@ -21,7 +23,6 @@ delete_space, ) from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/money.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/money.py index b278773da..414beab1e 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/money.py @@ -14,6 +14,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import ( NEMO_DIGIT, GraphFst, @@ -21,7 +23,6 @@ delete_extra_space, ) from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/ordinal.py index a7f79a1a8..98b3ac981 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/ordinal.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/punctuation.py index cbfe13063..4c96e0d0a 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/punctuation.py @@ -14,9 +14,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst + class PunctuationFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/telephone.py index 202547a40..52b1a6124 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/telephone.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path -from pynini.lib import pynutil class TelephoneFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/time.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/time.py index c13d6467e..529744b10 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/time.py @@ -15,6 +15,8 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import ( GraphFst, convert_space, @@ -23,7 +25,6 @@ insert_space, ) from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/tokenize_and_classify.py index 704c640c2..04edee6bd 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/tokenize_and_classify.py +++ 
b/nemo_text_processing/inverse_text_normalization/vi/taggers/tokenize_and_classify.py @@ -16,6 +16,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import ( GraphFst, delete_extra_space, @@ -37,7 +39,6 @@ from nemo_text_processing.inverse_text_normalization.vi.taggers.word import WordFst from nemo_text_processing.text_normalization.en.graph_utils import INPUT_LOWER_CASED from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/whitelist.py index 027a7af73..f9479d500 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/whitelist.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst, convert_space from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/word.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/word.py index 3e1367d70..4ec96b077 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/word.py @@ -14,9 +14,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/cardinal.py index 584758a3a..c1eda376a 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/date.py index d746528db..c7d3f21b6 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/date.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class DateFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/decimal.py index c016c20e0..1d039ea1d 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class DecimalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/electronic.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/electronic.py index 755172492..582a1e1b3 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class ElectronicFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/fraction.py index e5e87767b..b19502446 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/fraction.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini -from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class FractionFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/measure.py index abc06f724..9cf68abb8 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/measure.py @@ -14,13 +14,14 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import ( NEMO_CHAR, NEMO_NOT_QUOTE, GraphFst, delete_space, ) -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/money.py index dbea81ff4..8fdb193a6 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/money.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini -from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_CHAR, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_CHAR, GraphFst, delete_space + class MoneyFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/ordinal.py index c45e1ee7b..8ada63998 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/ordinal.py @@ -14,9 +14,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/telephone.py index d0e8de0b8..74d28decf 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/telephone.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini -from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst + class TelephoneFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py index 446160611..30d262722 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py @@ -14,6 +14,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, @@ -21,7 +23,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/verbalize_final.py index 40899e191..9b982f965 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/verbalize_final.py @@ -14,10 +14,11 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst, delete_extra_space, delete_space from nemo_text_processing.inverse_text_normalization.vi.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.vi.verbalizers.word import WordFst -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/whitelist.py index 558c75451..a0f630c4e 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/whitelist.py @@ -15,13 +15,14 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import ( NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space, ) -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/word.py index 3d7d6f8db..afd0c6a0c 100644 --- 
a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/word.py @@ -14,13 +14,14 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.vi.graph_utils import ( NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space, ) -from pynini.lib import pynutil class WordFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index c1b462472..c99ae25d2 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil class CardinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py index 55e77aeba..331f0b7ff 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py index 33f437955..8b702a0fe 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil def get_quantity(decimal, cardinal): diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py index 33fcd20a9..c4911e832 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py @@ -13,9 +13,10 @@ # limitations under the License. -from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst + class FractionFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py index d0a24ab3b..e660b6015 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py index 47ffbdd36..3c3ba1fdc 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst + class OrdinalFst(GraphFst): def __init__(self, cardinal: GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py index 4ca8eab9b..320f9272a 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst + class PunctuationFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py index 9a3aca388..288043367 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py index 3316bd336..96266df25 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( GraphFst, delete_extra_space, @@ -32,7 +34,6 @@ from nemo_text_processing.inverse_text_normalization.zh.taggers.whitelist import WhiteListFst from nemo_text_processing.inverse_text_normalization.zh.taggers.word import WordFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py index 8e0cbd328..c8ed1c2a3 100644 --- 
a/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py @@ -15,9 +15,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.zh.graph_utils import INPUT_LOWER_CASED, GraphFst from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/word.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/word.py index 3e129fb98..04c2d7916 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/word.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py index 3eec1a88b..31d5880dc 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py @@ -13,13 +13,14 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, GraphFst, delete_space, ) -from pynini.lib import pynutil class CardinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py index 2b979e6b8..60606aea0 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class DateFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py index ab9831783..28e2d5ff1 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py @@ -13,13 +13,14 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( NEMO_DIGIT, NEMO_NOT_QUOTE, GraphFst, delete_space, ) -from pynini.lib import pynutil class DecimalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py index d5ea2ced1..57dcbd95f 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst, delete_space + class FractionFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py index 2fd3919a4..92ba6d90d 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py @@ -13,13 +13,14 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( NEMO_DIGIT, NEMO_NOT_QUOTE, GraphFst, delete_space, ) -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py index 93f2a678d..13731724e 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py @@ -13,13 +13,14 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, GraphFst, delete_space, ) -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py index 4560fdf62..f0ea1bd28 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst, delete_space + class TimeFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py index e21b1d332..849cc690d 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py @@ -13,10 +13,11 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst, delete_extra_space, delete_space from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.zh.verbalizers.word import WordFst -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py index 994935b2b..df722ac25 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py @@ -14,13 +14,14 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space, ) -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py index 5888e2d8c..545de8af1 100644 --- 
a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py @@ -13,13 +13,14 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space, ) -from pynini.lib import pynutil class WordFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ar/graph_utils.py b/nemo_text_processing/text_normalization/ar/graph_utils.py index 0315aada5..164bd6f78 100644 --- a/nemo_text_processing/text_normalization/ar/graph_utils.py +++ b/nemo_text_processing/text_normalization/ar/graph_utils.py @@ -24,8 +24,8 @@ try: import pynini from pynini import Far - from pynini.export import export from pynini.examples import plurals + from pynini.export import export from pynini.lib import byte, pynutil, utf8 NEMO_CHAR = utf8.VALID_UTF8_CHAR diff --git a/nemo_text_processing/text_normalization/ar/taggers/__init__.py b/nemo_text_processing/text_normalization/ar/taggers/__init__.py index 67e5340b8..8944d9e6c 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/__init__.py +++ b/nemo_text_processing/text_normalization/ar/taggers/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. 
import pynini + from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/text_normalization/ar/taggers/cardinal.py b/nemo_text_processing/text_normalization/ar/taggers/cardinal.py index 1ac137e78..9a8ba7cd4 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/ar/taggers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.ar.graph_utils import GraphFst, flop_digits, insert_and, insert_space from nemo_text_processing.text_normalization.ar.utils import get_abs_path -from pynini.lib import pynutil class CardinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ar/taggers/decimal.py b/nemo_text_processing/text_normalization/ar/taggers/decimal.py index 9fd134399..1d777f1b6 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/ar/taggers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_DIGIT, NEMO_SPACE, GraphFst, insert_space from nemo_text_processing.text_normalization.ar.utils import get_abs_path -from pynini.lib import pynutil def get_quantity(decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike": diff --git a/nemo_text_processing/text_normalization/ar/taggers/fraction.py b/nemo_text_processing/text_normalization/ar/taggers/fraction.py index 74e568588..aad046011 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/ar/taggers/fraction.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.ar.graph_utils import GraphFst from nemo_text_processing.text_normalization.ar.utils import get_abs_path -from pynini.lib import pynutil class FractionFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ar/taggers/measure.py b/nemo_text_processing/text_normalization/ar/taggers/measure.py index 2aca51201..707b40998 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/measure.py +++ b/nemo_text_processing/text_normalization/ar/taggers/measure.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.ar.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -22,7 +24,6 @@ insert_space, ) from nemo_text_processing.text_normalization.ar.utils import get_abs_path -from pynini.lib import pynutil unit_singular = pynini.string_file(get_abs_path("data/measure/measurements.tsv")) diff --git a/nemo_text_processing/text_normalization/ar/taggers/money.py b/nemo_text_processing/text_normalization/ar/taggers/money.py index 266edf652..0df176491 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/money.py +++ b/nemo_text_processing/text_normalization/ar/taggers/money.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.ar.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -21,7 +23,6 @@ insert_space, ) from nemo_text_processing.text_normalization.ar.utils import get_abs_path, load_labels -from pynini.lib import pynutil min_singular = pynini.string_file(get_abs_path("data/money/currency_minor_singular.tsv")) min_plural = pynini.string_file(get_abs_path("data/money/currency_minor_plural.tsv")) diff --git a/nemo_text_processing/text_normalization/ar/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ar/taggers/tokenize_and_classify.py index a154ffa9e..5f3155ae2 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/ar/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.ar.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, @@ -31,7 +33,6 @@ from nemo_text_processing.text_normalization.ar.taggers.word import WordFst from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst from 
nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ar/taggers/word.py b/nemo_text_processing/text_normalization/ar/taggers/word.py index facd67bce..26a413028 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/word.py +++ b/nemo_text_processing/text_normalization/ar/taggers/word.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/__init__.py b/nemo_text_processing/text_normalization/ar/verbalizers/__init__.py index 67e5340b8..8944d9e6c 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/__init__.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. import pynini + from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/ar/verbalizers/cardinal.py index 2125e9a65..ec3a51838 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/decimal.py b/nemo_text_processing/text_normalization/ar/verbalizers/decimal.py index 28707f872..2903f435c 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class DecimalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/fraction.py b/nemo_text_processing/text_normalization/ar/verbalizers/fraction.py index 1b3330cec..3ce7e3ddd 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/fraction.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, insert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_NOT_QUOTE, GraphFst, insert_space + class FractionFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/measure.py b/nemo_text_processing/text_normalization/ar/verbalizers/measure.py index e2f3d19f8..aaca02de0 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/measure.py @@ -13,13 +13,14 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.ar.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, delete_extra_space, delete_preserve_order, ) -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/money.py b/nemo_text_processing/text_normalization/ar/verbalizers/money.py index 47e7e7d18..46da10742 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/money.py @@ -13,13 +13,14 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.ar.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, delete_preserve_order, delete_space, ) -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ar/verbalizers/verbalize_final.py index 69536a913..8388f8e84 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/verbalize_final.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.ar.graph_utils import ( GraphFst, delete_extra_space, @@ -24,7 +26,6 @@ from nemo_text_processing.text_normalization.ar.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.text_normalization.ar.verbalizers.word import WordFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/word.py b/nemo_text_processing/text_normalization/ar/verbalizers/word.py index 17ec7a310..eb0e2d2c7 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/word.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/de/taggers/cardinal.py b/nemo_text_processing/text_normalization/de/taggers/cardinal.py index 8f1a13b0c..bb14d2c95 100644 --- a/nemo_text_processing/text_normalization/de/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/de/taggers/cardinal.py @@ -15,6 +15,8 @@ from collections import defaultdict import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, @@ -23,7 +25,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil AND = "und" diff --git a/nemo_text_processing/text_normalization/de/taggers/date.py b/nemo_text_processing/text_normalization/de/taggers/date.py index ab56ad5ce..673bd8868 100644 --- a/nemo_text_processing/text_normalization/de/taggers/date.py +++ b/nemo_text_processing/text_normalization/de/taggers/date.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, @@ -21,7 +23,6 @@ GraphFst, insert_space, ) -from pynini.lib import pynutil graph_teen = pynini.invert(pynini.string_file(get_abs_path("data/numbers/teen.tsv"))).optimize() graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize() diff --git a/nemo_text_processing/text_normalization/de/taggers/decimal.py b/nemo_text_processing/text_normalization/de/taggers/decimal.py index d5cd58ed0..6381d942b 100644 --- a/nemo_text_processing/text_normalization/de/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/de/taggers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, insert_space -from pynini.lib import pynutil quantities = pynini.string_file(get_abs_path("data/numbers/quantities.tsv")) diff --git a/nemo_text_processing/text_normalization/de/taggers/electronic.py b/nemo_text_processing/text_normalization/de/taggers/electronic.py index 53a859700..8bca4646b 100644 --- a/nemo_text_processing/text_normalization/de/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/de/taggers/electronic.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space -from pynini.lib import pynutil class ElectronicFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/de/taggers/fraction.py b/nemo_text_processing/text_normalization/de/taggers/fraction.py index 71d51c600..31c113842 100644 --- 
a/nemo_text_processing/text_normalization/de/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/de/taggers/fraction.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + class FractionFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/de/taggers/measure.py b/nemo_text_processing/text_normalization/de/taggers/measure.py index c9dbe4120..122ff8a67 100644 --- a/nemo_text_processing/text_normalization/de/taggers/measure.py +++ b/nemo_text_processing/text_normalization/de/taggers/measure.py @@ -13,6 +13,9 @@ # limitations under the License. import pynini +from pynini.examples import plurals +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, @@ -23,8 +26,6 @@ convert_space, insert_space, ) -from pynini.examples import plurals -from pynini.lib import pynutil unit_singular = pynini.string_file(get_abs_path("data/measure/measurements.tsv")) suppletive = pynini.string_file(get_abs_path("data/measure/suppletive.tsv")) diff --git a/nemo_text_processing/text_normalization/de/taggers/money.py b/nemo_text_processing/text_normalization/de/taggers/money.py index bc27d4159..6c32d4706 100644 --- a/nemo_text_processing/text_normalization/de/taggers/money.py +++ b/nemo_text_processing/text_normalization/de/taggers/money.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, @@ -22,7 +24,6 @@ convert_space, insert_space, ) -from pynini.lib import pynutil min_singular = pynini.string_file(get_abs_path("data/money/currency_minor_singular.tsv")) min_plural = pynini.string_file(get_abs_path("data/money/currency_minor_plural.tsv")) diff --git a/nemo_text_processing/text_normalization/de/taggers/ordinal.py b/nemo_text_processing/text_normalization/de/taggers/ordinal.py index 564466551..f446099df 100644 --- a/nemo_text_processing/text_normalization/de/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/de/taggers/ordinal.py @@ -16,9 +16,10 @@ # Russian minimally supervised number grammar. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/de/taggers/telephone.py b/nemo_text_processing/text_normalization/de/taggers/telephone.py index f4306b9be..f40173b0f 100644 --- a/nemo_text_processing/text_normalization/de/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/de/taggers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, insert_space -from pynini.lib import pynutil class TelephoneFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/de/taggers/time.py b/nemo_text_processing/text_normalization/de/taggers/time.py index b9e244635..e9088a4a1 100644 --- a/nemo_text_processing/text_normalization/de/taggers/time.py +++ b/nemo_text_processing/text_normalization/de/taggers/time.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, convert_space, insert_space -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py index 46eeb09c0..bfcc295b6 100644 --- a/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.taggers.cardinal import CardinalFst from nemo_text_processing.text_normalization.de.taggers.date import DateFst from nemo_text_processing.text_normalization.de.taggers.decimal import DecimalFst @@ -37,7 +39,6 @@ ) from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/de/taggers/whitelist.py b/nemo_text_processing/text_normalization/de/taggers/whitelist.py index 4e8829bf1..c56fb138f 100644 --- 
a/nemo_text_processing/text_normalization/de/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/de/taggers/whitelist.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/de/taggers/word.py b/nemo_text_processing/text_normalization/de/taggers/word.py index 16743d28b..7e0df2a95 100644 --- a/nemo_text_processing/text_normalization/de/taggers/word.py +++ b/nemo_text_processing/text_normalization/de/taggers/word.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/de/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/de/verbalizers/cardinal.py index 6adf17dd7..6b6bbf356 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/de/verbalizers/date.py b/nemo_text_processing/text_normalization/de/verbalizers/date.py index 2a1a8ed72..20e775d28 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/date.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, @@ -20,7 +22,6 @@ GraphFst, delete_preserve_order, ) -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/de/verbalizers/decimal.py b/nemo_text_processing/text_normalization/de/verbalizers/decimal.py index 4e5ac7f94..915d5ab67 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/decimal.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.taggers.decimal import quantities from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, @@ -20,7 +22,6 @@ delete_preserve_order, insert_space, ) -from pynini.lib import pynutil class DecimalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/de/verbalizers/electronic.py b/nemo_text_processing/text_normalization/de/verbalizers/electronic.py index 184ce01b1..b3c2a378f 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/electronic.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, @@ -21,7 +23,6 @@ delete_preserve_order, insert_space, ) -from pynini.lib import pynutil class ElectronicFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/de/verbalizers/fraction.py b/nemo_text_processing/text_normalization/de/verbalizers/fraction.py index 2221a2912..961e4429c 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/fraction.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -20,7 +22,6 @@ delete_preserve_order, insert_space, ) -from pynini.lib import pynutil class FractionFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/de/verbalizers/measure.py b/nemo_text_processing/text_normalization/de/verbalizers/measure.py index b9bcd190e..41f7fb89c 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/measure.py @@ -13,13 +13,14 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, delete_extra_space, delete_preserve_order, ) -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/de/verbalizers/money.py b/nemo_text_processing/text_normalization/de/verbalizers/money.py index 94fde2482..ff29e8965 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/money.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_preserve_order from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_preserve_order + class MoneyFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py index 917337c70..f8d5f6967 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/de/verbalizers/telephone.py b/nemo_text_processing/text_normalization/de/verbalizers/telephone.py index 25fbeae8c..7a50e785f 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_preserve_order from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_preserve_order + class TelephoneFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/de/verbalizers/time.py b/nemo_text_processing/text_normalization/de/verbalizers/time.py index f6ba49332..a83f09f2a 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/time.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, @@ -21,7 +23,6 @@ convert_space, delete_preserve_order, ) -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/de/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/de/verbalizers/verbalize_final.py index b2c7f4503..f4e19ea0f 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/verbalize_final.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.de.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.text_normalization.en.graph_utils import ( GraphFst, @@ -24,7 +26,6 @@ ) from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/clean_eval_data.py b/nemo_text_processing/text_normalization/en/clean_eval_data.py index 8c33c4fa6..a7dc24310 100644 --- a/nemo_text_processing/text_normalization/en/clean_eval_data.py +++ b/nemo_text_processing/text_normalization/en/clean_eval_data.py @@ -16,6 +16,7 @@ from typing import List import regex as re + from nemo_text_processing.text_normalization.data_loader_utils import ( EOS_TYPE, Instance, @@ -23,7 +24,6 @@ training_data_to_sentences, ) - """ This file is for evaluation purposes. filter_loaded_data() cleans data (list of instances) for text normalization. Filters and cleaners can be specified for each semiotic class individually. 
diff --git a/nemo_text_processing/text_normalization/en/graph_utils.py b/nemo_text_processing/text_normalization/en/graph_utils.py index a38dfa175..239e1b282 100644 --- a/nemo_text_processing/text_normalization/en/graph_utils.py +++ b/nemo_text_processing/text_normalization/en/graph_utils.py @@ -19,13 +19,14 @@ from typing import Dict import pynini -from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels -from nemo_text_processing.utils.logging import logger from pynini import Far from pynini.examples import plurals from pynini.export import export from pynini.lib import byte, pynutil, utf8 +from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels +from nemo_text_processing.utils.logging import logger + NEMO_CHAR = utf8.VALID_UTF8_CHAR NEMO_DIGIT = byte.DIGIT diff --git a/nemo_text_processing/text_normalization/en/taggers/abbreviation.py b/nemo_text_processing/text_normalization/en/taggers/abbreviation.py index 640bb487d..f4abc3352 100644 --- a/nemo_text_processing/text_normalization/en/taggers/abbreviation.py +++ b/nemo_text_processing/text_normalization/en/taggers/abbreviation.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_UPPER, GraphFst, insert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_UPPER, GraphFst, insert_space + class AbbreviationFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/en/taggers/cardinal.py b/nemo_text_processing/text_normalization/en/taggers/cardinal.py index 4249d6cbd..616e018e3 100644 --- a/nemo_text_processing/text_normalization/en/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/en/taggers/cardinal.py @@ -14,6 +14,9 @@ import pynini +from pynini.examples import plurals +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_NOT_QUOTE, @@ -23,8 +26,6 @@ 
) from nemo_text_processing.text_normalization.en.taggers.date import get_four_digit_year_graph from nemo_text_processing.text_normalization.en.utils import get_abs_path -from pynini.examples import plurals -from pynini.lib import pynutil class CardinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/taggers/date.py b/nemo_text_processing/text_normalization/en/taggers/date.py index 93cca2395..c5e3dd418 100644 --- a/nemo_text_processing/text_normalization/en/taggers/date.py +++ b/nemo_text_processing/text_normalization/en/taggers/date.py @@ -13,6 +13,9 @@ # limitations under the License. import pynini +from pynini.examples import plurals +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, @@ -29,8 +32,6 @@ get_abs_path, load_labels, ) -from pynini.examples import plurals -from pynini.lib import pynutil graph_teen = pynini.invert(pynini.string_file(get_abs_path("data/number/teen.tsv"))).optimize() graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize() diff --git a/nemo_text_processing/text_normalization/en/taggers/decimal.py b/nemo_text_processing/text_normalization/en/taggers/decimal.py index 2486b5f8c..df9a3bddb 100644 --- a/nemo_text_processing/text_normalization/en/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/en/taggers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, TO_UPPER, GraphFst, get_abs_path from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, TO_UPPER, GraphFst, get_abs_path + delete_space = pynutil.delete(" ") quantities = pynini.string_file(get_abs_path("data/number/thousand.tsv")) quantities_abbr = pynini.string_file(get_abs_path("data/number/quantity_abbr.tsv")) diff --git a/nemo_text_processing/text_normalization/en/taggers/electronic.py b/nemo_text_processing/text_normalization/en/taggers/electronic.py index c347bfe1a..22975ab95 100644 --- a/nemo_text_processing/text_normalization/en/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/en/taggers/electronic.py @@ -14,6 +14,8 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( MIN_NEG_WEIGHT, NEMO_ALPHA, @@ -26,7 +28,6 @@ get_abs_path, insert_space, ) -from pynini.lib import pynutil class ElectronicFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/taggers/fraction.py b/nemo_text_processing/text_normalization/en/taggers/fraction.py index ac6877c22..2b33bd7bb 100644 --- a/nemo_text_processing/text_normalization/en/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/en/taggers/fraction.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, get_abs_path from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, get_abs_path + class FractionFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/en/taggers/measure.py b/nemo_text_processing/text_normalization/en/taggers/measure.py index 3878069b8..43f0a66d2 100644 --- a/nemo_text_processing/text_normalization/en/taggers/measure.py +++ b/nemo_text_processing/text_normalization/en/taggers/measure.py @@ -13,6 +13,9 @@ # limitations under the License. import pynini +from pynini.examples import plurals +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -32,8 +35,6 @@ from nemo_text_processing.text_normalization.en.taggers.whitelist import get_formats from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst as OrdinalVerbalizer -from pynini.examples import plurals -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/taggers/money.py b/nemo_text_processing/text_normalization/en/taggers/money.py index 43e26bda7..f0b31b43a 100644 --- a/nemo_text_processing/text_normalization/en/taggers/money.py +++ b/nemo_text_processing/text_normalization/en/taggers/money.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -23,7 +25,6 @@ insert_space, ) from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels -from pynini.lib import pynutil min_singular = pynini.string_file(get_abs_path("data/money/currency_minor_singular.tsv")) min_plural = pynini.string_file(get_abs_path("data/money/currency_minor_plural.tsv")) diff --git a/nemo_text_processing/text_normalization/en/taggers/ordinal.py b/nemo_text_processing/text_normalization/en/taggers/ordinal.py index bb4cc007f..70ae2d70d 100644 --- a/nemo_text_processing/text_normalization/en/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/en/taggers/ordinal.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/en/taggers/punctuation.py b/nemo_text_processing/text_normalization/en/taggers/punctuation.py index 769b020ce..56d2cdcb2 100644 --- a/nemo_text_processing/text_normalization/en/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/en/taggers/punctuation.py @@ -16,11 +16,12 @@ from unicodedata import category import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, NEMO_SIGMA, GraphFst -from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels from pynini.examples import plurals from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, NEMO_SIGMA, GraphFst +from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels + class PunctuationFst(GraphFst): """ diff --git 
a/nemo_text_processing/text_normalization/en/taggers/range.py b/nemo_text_processing/text_normalization/en/taggers/range.py index 92c0c325a..5e0d017d4 100644 --- a/nemo_text_processing/text_normalization/en/taggers/range.py +++ b/nemo_text_processing/text_normalization/en/taggers/range.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, convert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, convert_space + class RangeFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/en/taggers/roman.py b/nemo_text_processing/text_normalization/en/taggers/roman.py index e12ee4a2b..d0f394273 100644 --- a/nemo_text_processing/text_normalization/en/taggers/roman.py +++ b/nemo_text_processing/text_normalization/en/taggers/roman.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_SIGMA, GraphFst from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels -from pynini.lib import pynutil class RomanFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/taggers/serial.py b/nemo_text_processing/text_normalization/en/taggers/serial.py index 73b6c4e4f..e1a76dd63 100644 --- a/nemo_text_processing/text_normalization/en/taggers/serial.py +++ b/nemo_text_processing/text_normalization/en/taggers/serial.py @@ -14,6 +14,9 @@ import pynini +from pynini.examples import plurals +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -23,8 +26,6 @@ convert_space, ) from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels -from pynini.examples import plurals -from pynini.lib import pynutil class SerialFst(GraphFst): diff --git 
a/nemo_text_processing/text_normalization/en/taggers/telephone.py b/nemo_text_processing/text_normalization/en/taggers/telephone.py index 1caedffdd..06d791264 100644 --- a/nemo_text_processing/text_normalization/en/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/en/taggers/telephone.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -24,7 +26,6 @@ plurals, ) from nemo_text_processing.text_normalization.en.utils import get_abs_path -from pynini.lib import pynutil class TelephoneFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/taggers/time.py b/nemo_text_processing/text_normalization/en/taggers/time.py index 4020996cc..a66f18314 100644 --- a/nemo_text_processing/text_normalization/en/taggers/time.py +++ b/nemo_text_processing/text_normalization/en/taggers/time.py @@ -14,6 +14,8 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, GraphFst, @@ -26,7 +28,6 @@ get_abs_path, load_labels, ) -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py index b3ac3ed75..bb4665293 100644 --- a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py @@ -16,6 +16,8 @@ import time import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_WHITE_SPACE, GraphFst, @@ -44,7 +46,6 @@ from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst as vOrdinalFst from nemo_text_processing.text_normalization.en.verbalizers.time import TimeFst as vTimeFst from 
nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_lm.py b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_lm.py index 5f51763f6..95c22bcbe 100644 --- a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_lm.py +++ b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_lm.py @@ -15,6 +15,9 @@ import os import pynini +from pynini.examples import plurals +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, @@ -55,8 +58,6 @@ from nemo_text_processing.text_normalization.en.verbalizers.time import TimeFst as vTime from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst as vWord from nemo_text_processing.utils.logging import logger -from pynini.examples import plurals -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_with_audio.py b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_with_audio.py index f0cca88e5..110747cab 100644 --- a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_with_audio.py +++ b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_with_audio.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, @@ -56,7 +58,6 @@ from nemo_text_processing.text_normalization.en.verbalizers.time import TimeFst as vTime from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst as vWord from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git 
a/nemo_text_processing/text_normalization/en/taggers/whitelist.py b/nemo_text_processing/text_normalization/en/taggers/whitelist.py index a51edec82..8d19a714e 100644 --- a/nemo_text_processing/text_normalization/en/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/en/taggers/whitelist.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, INPUT_LOWER_CASED, @@ -30,7 +32,6 @@ get_abs_path, load_labels, ) -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/taggers/word.py b/nemo_text_processing/text_normalization/en/taggers/word.py index fa6a965aa..b988d4c9d 100644 --- a/nemo_text_processing/text_normalization/en/taggers/word.py +++ b/nemo_text_processing/text_normalization/en/taggers/word.py @@ -13,6 +13,9 @@ # limitations under the License. import pynini +from pynini.examples import plurals +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( MIN_NEG_WEIGHT, NEMO_ALPHA, @@ -24,8 +27,6 @@ get_abs_path, ) from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst -from pynini.examples import plurals -from pynini.lib import pynutil class WordFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/verbalizers/abbreviation.py b/nemo_text_processing/text_normalization/en/verbalizers/abbreviation.py index 191792431..95713b20d 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/abbreviation.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/abbreviation.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class AbbreviationFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/en/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/en/verbalizers/cardinal.py index eab85015d..a398f892a 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/en/verbalizers/date.py b/nemo_text_processing/text_normalization/en/verbalizers/date.py index 6a2cd12b9..548f18219 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/date.py @@ -13,6 +13,9 @@ # limitations under the License. 
import pynini +from pynini.examples import plurals +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -20,8 +23,6 @@ delete_extra_space, delete_space, ) -from pynini.examples import plurals -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/verbalizers/decimal.py b/nemo_text_processing/text_normalization/en/verbalizers/decimal.py index 9285edf20..6aa3e9af0 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/decimal.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -20,7 +22,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class DecimalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/verbalizers/electronic.py b/nemo_text_processing/text_normalization/en/verbalizers/electronic.py index 1e3fc9b50..98afc3f24 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/electronic.py @@ -13,6 +13,9 @@ # limitations under the License. 
import pynini +from pynini.examples import plurals +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( MIN_NEG_WEIGHT, NEMO_ALPHA, @@ -28,8 +31,6 @@ insert_space, ) from nemo_text_processing.text_normalization.en.utils import get_abs_path -from pynini.examples import plurals -from pynini.lib import pynutil class ElectronicFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/verbalizers/fraction.py b/nemo_text_processing/text_normalization/en/verbalizers/fraction.py index d0c5dc2b1..b4b501210 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/fraction.py @@ -13,11 +13,12 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, insert_space -from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst from pynini.examples import plurals from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, insert_space +from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst + class FractionFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/en/verbalizers/measure.py b/nemo_text_processing/text_normalization/en/verbalizers/measure.py index 8f9bd2108..ae5fa8800 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/measure.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -20,7 +22,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/verbalizers/money.py b/nemo_text_processing/text_normalization/en/verbalizers/money.py index b3cbc4a31..5341c8001 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/money.py @@ -13,13 +13,14 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, delete_extra_space, delete_preserve_order, ) -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py index c64579ae5..4ad7d1c85 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space from nemo_text_processing.text_normalization.en.utils import get_abs_path -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py index 2075c0389..85d0693cf 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from 
nemo_text_processing.text_normalization.en.graph_utils import ( MIN_NEG_WEIGHT, NEMO_ALPHA, @@ -25,7 +27,6 @@ ) from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class PostProcessingFst: diff --git a/nemo_text_processing/text_normalization/en/verbalizers/roman.py b/nemo_text_processing/text_normalization/en/verbalizers/roman.py index 43faebe76..72ab7eee8 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/roman.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/roman.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst -from pynini.lib import pynutil class RomanFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/verbalizers/telephone.py b/nemo_text_processing/text_normalization/en/verbalizers/telephone.py index 4af7bbb87..0fb51bcba 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space + class TelephoneFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/en/verbalizers/time.py b/nemo_text_processing/text_normalization/en/verbalizers/time.py index 518c7dfa2..34503eb1d 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/time.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -20,7 +22,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/en/verbalizers/verbalize_final.py index f8212e9f7..6c4fdbc8f 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/verbalize_final.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( GraphFst, delete_extra_space, @@ -24,7 +26,6 @@ from nemo_text_processing.text_normalization.en.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/en/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/en/verbalizers/whitelist.py index 96aa2075b..926de207b 100644 --- 
a/nemo_text_processing/text_normalization/en/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/whitelist.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + class WhiteListFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/en/verbalizers/word.py b/nemo_text_processing/text_normalization/en/verbalizers/word.py index e124f42ff..5e5dddd21 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/word.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/es/graph_utils.py b/nemo_text_processing/text_normalization/es/graph_utils.py index f9f3dc22b..24ae0aee3 100644 --- a/nemo_text_processing/text_normalization/es/graph_utils.py +++ b/nemo_text_processing/text_normalization/es/graph_utils.py @@ -13,10 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, NEMO_SPACE from nemo_text_processing.text_normalization.es import LOCALIZATION from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels -from pynini.lib import pynutil digits = pynini.project(pynini.string_file(get_abs_path("data/numbers/digit.tsv")), "input") tens = pynini.project(pynini.string_file(get_abs_path("data/numbers/ties.tsv")), "input") diff --git a/nemo_text_processing/text_normalization/es/taggers/cardinal.py b/nemo_text_processing/text_normalization/es/taggers/cardinal.py index c535100f7..1b8f0a440 100644 --- a/nemo_text_processing/text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/es/taggers/cardinal.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -24,7 +26,6 @@ ) from nemo_text_processing.text_normalization.es.graph_utils import cardinal_separator from nemo_text_processing.text_normalization.es.utils import get_abs_path -from pynini.lib import pynutil zero = pynini.invert(pynini.string_file(get_abs_path("data/numbers/zero.tsv"))) digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))) diff --git a/nemo_text_processing/text_normalization/es/taggers/date.py b/nemo_text_processing/text_normalization/es/taggers/date.py index 63c76ff56..ea7f15292 100644 --- a/nemo_text_processing/text_normalization/es/taggers/date.py +++ b/nemo_text_processing/text_normalization/es/taggers/date.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_SPACE, GraphFst, delete_extra_space from nemo_text_processing.text_normalization.es.graph_utils import roman_to_int, strip_cardinal_apocope from nemo_text_processing.text_normalization.es.utils import get_abs_path -from pynini.lib import pynutil articles = pynini.union("de", "del", "el", "del año", "año") delete_leading_zero = (pynutil.delete("0") | (NEMO_DIGIT - "0")) + NEMO_DIGIT diff --git a/nemo_text_processing/text_normalization/es/taggers/decimals.py b/nemo_text_processing/text_normalization/es/taggers/decimals.py index 730643d4b..2a3b9c560 100644 --- a/nemo_text_processing/text_normalization/es/taggers/decimals.py +++ b/nemo_text_processing/text_normalization/es/taggers/decimals.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, @@ -26,7 +28,6 @@ strip_cardinal_apocope, ) from nemo_text_processing.text_normalization.es.utils import get_abs_path -from pynini.lib import pynutil quantities = pynini.string_file(get_abs_path("data/numbers/quantities.tsv")) digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))) diff --git a/nemo_text_processing/text_normalization/es/taggers/electronic.py b/nemo_text_processing/text_normalization/es/taggers/electronic.py index 13833bbe0..c984cb957 100644 --- a/nemo_text_processing/text_normalization/es/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/es/taggers/electronic.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels -from pynini.lib import pynutil common_domains = [x[0] for x in load_labels(get_abs_path("data/electronic/domain.tsv"))] symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))] diff --git a/nemo_text_processing/text_normalization/es/taggers/fraction.py b/nemo_text_processing/text_normalization/es/taggers/fraction.py index 436b4fa20..1fb5b8118 100644 --- a/nemo_text_processing/text_normalization/es/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/es/taggers/fraction.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, @@ -20,7 +22,6 @@ GraphFst, ) from nemo_text_processing.text_normalization.es.utils import get_abs_path -from pynini.lib import pynutil ordinal_exceptions = pynini.string_file(get_abs_path("data/fractions/ordinal_exceptions.tsv")) higher_powers_of_ten = pynini.string_file(get_abs_path("data/fractions/powers_of_ten.tsv")) diff --git a/nemo_text_processing/text_normalization/es/taggers/measure.py b/nemo_text_processing/text_normalization/es/taggers/measure.py index 2d1eb3ff4..a1933dbed 100644 --- a/nemo_text_processing/text_normalization/es/taggers/measure.py +++ b/nemo_text_processing/text_normalization/es/taggers/measure.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_NON_BREAKING_SPACE, @@ -24,7 +26,6 @@ ) from nemo_text_processing.text_normalization.es.graph_utils import strip_cardinal_apocope from nemo_text_processing.text_normalization.es.utils import get_abs_path -from pynini.lib import pynutil unit = pynini.string_file(get_abs_path("data/measures/measurements.tsv")) unit_complex = pynini.string_file(get_abs_path("data/measures/measurements_complex.tsv")) diff --git a/nemo_text_processing/text_normalization/es/taggers/money.py b/nemo_text_processing/text_normalization/es/taggers/money.py index 3cb074550..bbb973465 100644 --- a/nemo_text_processing/text_normalization/es/taggers/money.py +++ b/nemo_text_processing/text_normalization/es/taggers/money.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -23,7 +25,6 @@ ) from nemo_text_processing.text_normalization.es.graph_utils import decimal_separator from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels -from pynini.lib import pynutil maj_singular_labels = load_labels(get_abs_path("data/money/currency_major.tsv")) maj_singular = pynini.string_file((get_abs_path("data/money/currency_major.tsv"))) diff --git a/nemo_text_processing/text_normalization/es/taggers/ordinal.py b/nemo_text_processing/text_normalization/es/taggers/ordinal.py index 8b73d4785..8af8773e5 100644 --- a/nemo_text_processing/text_normalization/es/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/es/taggers/ordinal.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_SIGMA, @@ -21,7 +23,6 @@ ) from nemo_text_processing.text_normalization.es.graph_utils import roman_to_int, strip_accent from nemo_text_processing.text_normalization.es.utils import get_abs_path -from pynini.lib import pynutil digit = pynini.invert(pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))) teens = pynini.invert(pynini.string_file(get_abs_path("data/ordinals/teen.tsv"))) diff --git a/nemo_text_processing/text_normalization/es/taggers/telephone.py b/nemo_text_processing/text_normalization/es/taggers/telephone.py index a1d95b468..83efc587c 100644 --- a/nemo_text_processing/text_normalization/es/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/es/taggers/telephone.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_SIGMA, @@ -21,7 +23,6 @@ ) from nemo_text_processing.text_normalization.es.graph_utils import ones from nemo_text_processing.text_normalization.es.utils import get_abs_path -from pynini.lib import pynutil graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")) diff --git a/nemo_text_processing/text_normalization/es/taggers/time.py b/nemo_text_processing/text_normalization/es/taggers/time.py index e5bebcfdc..4a947dd31 100644 --- a/nemo_text_processing/text_normalization/es/taggers/time.py +++ b/nemo_text_processing/text_normalization/es/taggers/time.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, @@ -20,7 +22,6 @@ insert_space, ) from nemo_text_processing.text_normalization.es.utils import get_abs_path -from pynini.lib import pynutil time_zones = pynini.string_file(get_abs_path("data/time/time_zone.tsv")) suffix = pynini.string_file(get_abs_path("data/time/time_suffix.tsv")) diff --git a/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py index 68c5fb975..7512db12d 100644 --- a/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_WHITE_SPACE, GraphFst, @@ -36,7 +38,6 @@ from nemo_text_processing.text_normalization.es.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.es.taggers.word import WordFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/es/taggers/whitelist.py b/nemo_text_processing/text_normalization/es/taggers/whitelist.py index 9f6c6d99b..ac591017b 100644 --- a/nemo_text_processing/text_normalization/es/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/es/taggers/whitelist.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/es/taggers/word.py b/nemo_text_processing/text_normalization/es/taggers/word.py index 90ecbc99c..cc2eeec2f 100644 --- a/nemo_text_processing/text_normalization/es/taggers/word.py +++ b/nemo_text_processing/text_normalization/es/taggers/word.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/es/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/es/verbalizers/cardinal.py index 1806d1477..972100be8 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/cardinal.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from nemo_text_processing.text_normalization.es.graph_utils import ( add_cardinal_apocope_fem, shift_cardinal_gender, strip_cardinal_apocope, ) -from pynini.lib import pynutil class CardinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/es/verbalizers/date.py b/nemo_text_processing/text_normalization/es/verbalizers/date.py index bf4393fee..586890d73 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/date.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -21,7 +23,6 @@ ) from nemo_text_processing.text_normalization.es.graph_utils import strip_cardinal_apocope from nemo_text_processing.text_normalization.es.taggers.date import articles -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/es/verbalizers/decimals.py b/nemo_text_processing/text_normalization/es/verbalizers/decimals.py index 643c9a5dd..3a94899fc 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/decimals.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/decimals.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, @@ -26,7 +28,6 @@ shift_number_gender, strip_cardinal_apocope, ) -from pynini.lib import pynutil class DecimalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/es/verbalizers/electronic.py b/nemo_text_processing/text_normalization/es/verbalizers/electronic.py index 0c866c550..d9fa598d9 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/electronic.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -20,7 +22,6 @@ insert_space, ) from nemo_text_processing.text_normalization.es.utils import get_abs_path -from pynini.lib import pynutil digit_no_zero = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))) zero = pynini.invert(pynini.string_file(get_abs_path("data/numbers/zero.tsv"))) diff --git a/nemo_text_processing/text_normalization/es/verbalizers/fraction.py b/nemo_text_processing/text_normalization/es/verbalizers/fraction.py index fdbf83e68..094098f2e 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/fraction.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_NOT_QUOTE, @@ -27,7 +29,6 @@ shift_cardinal_gender, strip_cardinal_apocope, ) -from pynini.lib import pynutil class FractionFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/es/verbalizers/measure.py b/nemo_text_processing/text_normalization/es/verbalizers/measure.py index f9659acb7..de877446d 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/measure.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -24,7 +26,6 @@ ) from nemo_text_processing.text_normalization.es.graph_utils import ones from nemo_text_processing.text_normalization.es.utils import get_abs_path -from pynini.lib import pynutil unit_plural_fem = pynini.string_file(get_abs_path("data/measures/measurements_plural_fem.tsv")) unit_plural_masc = pynini.string_file(get_abs_path("data/measures/measurements_plural_masc.tsv")) diff --git a/nemo_text_processing/text_normalization/es/verbalizers/money.py b/nemo_text_processing/text_normalization/es/verbalizers/money.py index d2ba9c707..e83e0444d 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/money.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -21,7 +23,6 @@ ) from nemo_text_processing.text_normalization.es.graph_utils import shift_cardinal_gender, strip_cardinal_apocope from nemo_text_processing.text_normalization.es.utils import get_abs_path -from pynini.lib import pynutil fem = pynini.string_file((get_abs_path("data/money/currency_plural_fem.tsv"))) masc = pynini.string_file((get_abs_path("data/money/currency_plural_masc.tsv"))) diff --git a/nemo_text_processing/text_normalization/es/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/es/verbalizers/ordinal.py index 009cdf343..4def8307a 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/ordinal.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, NEMO_SPACE, GraphFst from nemo_text_processing.text_normalization.es.graph_utils import shift_number_gender -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/es/verbalizers/telephone.py b/nemo_text_processing/text_normalization/es/verbalizers/telephone.py index 9f0b4fcc2..e54fa0d95 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/telephone.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space + class TelephoneFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/es/verbalizers/time.py b/nemo_text_processing/text_normalization/es/verbalizers/time.py index 23fa1d180..7ca9b43a5 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/time.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -21,7 +23,6 @@ insert_space, ) from nemo_text_processing.text_normalization.es.utils import get_abs_path -from pynini.lib import pynutil alt_minutes = pynini.string_file(get_abs_path("data/time/alt_minutes.tsv")) diff --git a/nemo_text_processing/text_normalization/es/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/es/verbalizers/verbalize_final.py index 2840055fa..d2066a1bd 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/verbalize_final.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( GraphFst, delete_extra_space, @@ -24,7 +26,6 @@ from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.text_normalization.es.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git 
a/nemo_text_processing/text_normalization/fr/taggers/cardinal.py b/nemo_text_processing/text_normalization/fr/taggers/cardinal.py index b95f6466a..49fda53ed 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/fr/taggers/cardinal.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst, insert_space from nemo_text_processing.text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class CardinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/fr/taggers/decimals.py b/nemo_text_processing/text_normalization/fr/taggers/decimals.py index 49bf55037..a6e492e01 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/decimals.py +++ b/nemo_text_processing/text_normalization/fr/taggers/decimals.py @@ -14,6 +14,8 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, @@ -22,7 +24,6 @@ insert_space, ) from nemo_text_processing.text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil quantities = pynini.string_file(get_abs_path("data/numbers/quantities.tsv")) digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) diff --git a/nemo_text_processing/text_normalization/fr/taggers/fraction.py b/nemo_text_processing/text_normalization/fr/taggers/fraction.py index f65ee87ed..a8377bf6f 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/fr/taggers/fraction.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst + class FractionFst(GraphFst): """ diff --git 
a/nemo_text_processing/text_normalization/fr/taggers/ordinal.py b/nemo_text_processing/text_normalization/fr/taggers/ordinal.py index 4c8c8c73d..d3afb13da 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/fr/taggers/ordinal.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SPACE, GraphFst from nemo_text_processing.text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py index 336f67594..2c2518385 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_WHITE_SPACE, GraphFst, @@ -30,7 +32,6 @@ from nemo_text_processing.text_normalization.fr.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.fr.taggers.word import WordFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/fr/taggers/whitelist.py b/nemo_text_processing/text_normalization/fr/taggers/whitelist.py index d84cc7f02..06dde93cd 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/fr/taggers/whitelist.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space from nemo_text_processing.text_normalization.fr.utils import get_abs_path, load_labels -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/fr/taggers/word.py b/nemo_text_processing/text_normalization/fr/taggers/word.py index a67d9e9a3..c49a6ffaf 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/word.py +++ b/nemo_text_processing/text_normalization/fr/taggers/word.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/fr/verbalizers/cardinal.py index a911d3430..a12dbf520 100644 --- a/nemo_text_processing/text_normalization/fr/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/fr/verbalizers/cardinal.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, insert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, insert_space + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/decimals.py b/nemo_text_processing/text_normalization/fr/verbalizers/decimals.py index a248c2c1a..af892e6ca 100644 --- a/nemo_text_processing/text_normalization/fr/verbalizers/decimals.py +++ b/nemo_text_processing/text_normalization/fr/verbalizers/decimals.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, @@ -19,7 +21,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class DecimalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/fraction.py b/nemo_text_processing/text_normalization/fr/verbalizers/fraction.py index 8032f80bc..7d2ecb395 100644 --- a/nemo_text_processing/text_normalization/fr/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/fr/verbalizers/fraction.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -20,7 +22,6 @@ insert_space, ) from nemo_text_processing.text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class FractionFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/fr/verbalizers/ordinal.py index 039d5a12f..9e0233f6e 100644 --- a/nemo_text_processing/text_normalization/fr/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/fr/verbalizers/ordinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst from nemo_text_processing.text_normalization.fr.utils import get_abs_path -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/fr/verbalizers/verbalize_final.py index a1690f392..0313f7f5b 100644 --- a/nemo_text_processing/text_normalization/fr/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/fr/verbalizers/verbalize_final.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( GraphFst, delete_extra_space, @@ -24,7 +26,6 @@ from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.text_normalization.fr.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/hu/graph_utils.py b/nemo_text_processing/text_normalization/hu/graph_utils.py index 971b8bdd6..be7c157fd 100644 --- 
a/nemo_text_processing/text_normalization/hu/graph_utils.py +++ b/nemo_text_processing/text_normalization/hu/graph_utils.py @@ -14,9 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import delete_space, insert_space from pynini.lib import byte +from nemo_text_processing.text_normalization.en.graph_utils import delete_space, insert_space + _ALPHA_UPPER = "AÁBCDEÉFGHIÍJKLMNOÓÖŐPQRSTUÚÜŰVWXYZ" _ALPHA_LOWER = "aábcdeéfghiíjklmnoóöőpqrstuúüűvwxyz" _VOWELS = "AÁEÉIÍOÓÖŐUÚÜŰaáeéiíoóöőuúüű" diff --git a/nemo_text_processing/text_normalization/hu/taggers/cardinal.py b/nemo_text_processing/text_normalization/hu/taggers/cardinal.py index 91efa3085..c20a3d27b 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hu/taggers/cardinal.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, @@ -24,7 +26,6 @@ ) from nemo_text_processing.text_normalization.hu.graph_utils import HU_ALPHA from nemo_text_processing.text_normalization.hu.utils import get_abs_path -from pynini.lib import pynutil def make_million(word: str, hundreds: 'pynini.FstLike', deterministic=False): diff --git a/nemo_text_processing/text_normalization/hu/taggers/date.py b/nemo_text_processing/text_normalization/hu/taggers/date.py index efa5e44c4..0cfddd652 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/date.py +++ b/nemo_text_processing/text_normalization/hu/taggers/date.py @@ -14,10 +14,11 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_DIGIT, NEMO_SPACE, GraphFst from nemo_text_processing.text_normalization.hu.graph_utils import TO_LOWER, TO_UPPER from nemo_text_processing.text_normalization.hu.utils import get_abs_path, load_labels -from pynini.lib import pynutil def get_suffixed_days(labels): diff --git a/nemo_text_processing/text_normalization/hu/taggers/decimal.py b/nemo_text_processing/text_normalization/hu/taggers/decimal.py index 9e30b55a3..a6f819d17 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/hu/taggers/decimal.py @@ -14,9 +14,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst, insert_space from nemo_text_processing.text_normalization.hu.utils import get_abs_path, load_labels, naive_inflector -from pynini.lib import pynutil quantities = load_labels(get_abs_path("data/number/quantities.tsv")) diff --git a/nemo_text_processing/text_normalization/hu/taggers/electronic.py b/nemo_text_processing/text_normalization/hu/taggers/electronic.py index f7d1fe4d5..7381460d9 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/hu/taggers/electronic.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels -from pynini.lib import pynutil common_domains = [x[0] for x in load_labels(get_abs_path("data/electronic/domain.tsv"))] symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))] diff --git a/nemo_text_processing/text_normalization/hu/taggers/fraction.py b/nemo_text_processing/text_normalization/hu/taggers/fraction.py index 53f630cf2..e67146bc5 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/hu/taggers/fraction.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from nemo_text_processing.text_normalization.hu.utils import get_abs_path -from pynini.lib import pynutil fraction_symbols = pynini.string_file(get_abs_path("data/fractions/fraction_symbols.tsv")) diff --git a/nemo_text_processing/text_normalization/hu/taggers/measure.py b/nemo_text_processing/text_normalization/hu/taggers/measure.py index 8f93d1758..9e5f328fb 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hu/taggers/measure.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_NON_BREAKING_SPACE, @@ -23,7 +25,6 @@ ) from nemo_text_processing.text_normalization.hu.taggers.whitelist import load_inflected from nemo_text_processing.text_normalization.hu.utils import get_abs_path -from pynini.lib import pynutil unit_singular = pynini.string_file(get_abs_path("data/measures/measurements.tsv")) diff --git a/nemo_text_processing/text_normalization/hu/taggers/money.py b/nemo_text_processing/text_normalization/hu/taggers/money.py index 5cfafdba1..f45e60835 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/money.py +++ b/nemo_text_processing/text_normalization/hu/taggers/money.py @@ -15,6 +15,8 @@ import re import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -30,7 +32,6 @@ load_labels, naive_inflector, ) -from pynini.lib import pynutil min_singular = pynini.string_file(get_abs_path("data/money/currency_minor.tsv")) maj_singular = pynini.string_file((get_abs_path("data/money/currency.tsv"))) diff --git a/nemo_text_processing/text_normalization/hu/taggers/ordinal.py b/nemo_text_processing/text_normalization/hu/taggers/ordinal.py index 4319ee372..634e006e6 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/hu/taggers/ordinal.py @@ -16,10 +16,11 @@ # Russian minimally supervised number grammar. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst from nemo_text_processing.text_normalization.hu.taggers.cardinal import filter_punctuation from nemo_text_processing.text_normalization.hu.utils import get_abs_path -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/hu/taggers/telephone.py b/nemo_text_processing/text_normalization/hu/taggers/telephone.py index 95ea04643..856353a30 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/hu/taggers/telephone.py @@ -14,6 +14,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_SPACE, GraphFst, @@ -22,7 +24,6 @@ ) from nemo_text_processing.text_normalization.hu.taggers.cardinal import CardinalFst from nemo_text_processing.text_normalization.hu.utils import get_abs_path -from pynini.lib import pynutil class TelephoneFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/hu/taggers/time.py b/nemo_text_processing/text_normalization/hu/taggers/time.py index 138ad97a0..65dc26398 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/time.py +++ b/nemo_text_processing/text_normalization/hu/taggers/time.py @@ -15,6 +15,8 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SPACE, @@ -28,7 +30,6 @@ load_labels, naive_inflector, ) -from pynini.lib import pynutil QUARTERS = {15: "negyed", 30: "fél", 45: "háromnegyed"} diff --git a/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py index eda4e20ad..bbd82b0b7 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py +++ 
b/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_WHITE_SPACE, GraphFst, @@ -36,7 +38,6 @@ from nemo_text_processing.text_normalization.hu.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.hu.taggers.word import WordFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/hu/taggers/whitelist.py b/nemo_text_processing/text_normalization/hu/taggers/whitelist.py index 95cbfee22..88e7d669f 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/hu/taggers/whitelist.py @@ -13,9 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space from nemo_text_processing.text_normalization.hu.utils import get_abs_path, load_labels, naive_inflector -from pynini.lib import pynutil def load_inflected(filename, input_case, singular_only=False, skip_spaces=True): diff --git a/nemo_text_processing/text_normalization/hu/taggers/word.py b/nemo_text_processing/text_normalization/hu/taggers/word.py index 1d7a6e9b0..31a71ef14 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/word.py +++ b/nemo_text_processing/text_normalization/hu/taggers/word.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/hu/verbalizers/cardinal.py index 3e80e3540..9bed76d9e 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/date.py b/nemo_text_processing/text_normalization/hu/verbalizers/date.py index bda801b34..047c736b9 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/date.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_preserve_order from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_preserve_order + class DateFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/decimal.py b/nemo_text_processing/text_normalization/hu/verbalizers/decimal.py index 8e5d8c880..93070b050 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/decimal.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, @@ -20,7 +22,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class DecimalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/electronic.py b/nemo_text_processing/text_normalization/hu/verbalizers/electronic.py index 491b94dfe..785ac85ee 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/electronic.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -20,7 +22,6 @@ insert_space, ) from nemo_text_processing.text_normalization.hu.utils import get_abs_path -from pynini.lib import pynutil digit_no_zero = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))) zero = pynini.invert(pynini.string_file(get_abs_path("data/number/zero.tsv"))) diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hu/verbalizers/fraction.py index 2f3019b6e..8b477a5fe 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/fraction.py @@ -13,13 +13,14 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, delete_preserve_order, insert_space, ) -from pynini.lib import pynutil class FractionFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/measure.py b/nemo_text_processing/text_normalization/hu/verbalizers/measure.py index b9bcd190e..41f7fb89c 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/measure.py @@ -13,13 +13,14 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, delete_extra_space, delete_preserve_order, ) -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/money.py b/nemo_text_processing/text_normalization/hu/verbalizers/money.py index 342abfd09..316fcc7a0 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/money.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_preserve_order from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_preserve_order + class MoneyFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/hu/verbalizers/ordinal.py index 53b0cdf41..eb5369931 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/ordinal.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py b/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py index 778407bed..f17f7c36a 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space + class TelephoneFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/time.py b/nemo_text_processing/text_normalization/hu/verbalizers/time.py index a0efd5a4e..9f960b100 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/time.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -23,7 +25,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/hu/verbalizers/verbalize_final.py index 81c6aa844..86db540d0 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/verbalize_final.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( GraphFst, delete_extra_space, @@ -24,7 +26,6 @@ from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.text_normalization.hu.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/it/taggers/cardinal.py b/nemo_text_processing/text_normalization/it/taggers/cardinal.py index 59d3a61f9..ecb003775 100644 --- 
a/nemo_text_processing/text_normalization/it/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/it/taggers/cardinal.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -25,7 +27,6 @@ ) from nemo_text_processing.text_normalization.es.graph_utils import cardinal_separator from nemo_text_processing.text_normalization.it.utils import get_abs_path -from pynini.lib import pynutil zero = pynini.invert(pynini.string_file(get_abs_path("data/numbers/zero.tsv"))) digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))) diff --git a/nemo_text_processing/text_normalization/it/taggers/decimals.py b/nemo_text_processing/text_normalization/it/taggers/decimals.py index fe1c18e49..4e32855ad 100644 --- a/nemo_text_processing/text_normalization/it/taggers/decimals.py +++ b/nemo_text_processing/text_normalization/it/taggers/decimals.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, @@ -27,7 +29,6 @@ strip_cardinal_apocope, ) from nemo_text_processing.text_normalization.it.utils import get_abs_path -from pynini.lib import pynutil quantities = pynini.string_file(get_abs_path("data/numbers/quantities.tsv")) digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))) diff --git a/nemo_text_processing/text_normalization/it/taggers/electronic.py b/nemo_text_processing/text_normalization/it/taggers/electronic.py index 7bb22c9d0..2504b60cf 100644 --- a/nemo_text_processing/text_normalization/it/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/it/taggers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space from nemo_text_processing.text_normalization.it.utils import get_abs_path, load_labels -from pynini.lib import pynutil common_domains = [x[0] for x in load_labels(get_abs_path("data/electronic/domain.tsv"))] symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))] diff --git a/nemo_text_processing/text_normalization/it/taggers/measure.py b/nemo_text_processing/text_normalization/it/taggers/measure.py index c287ada52..d3591089e 100644 --- a/nemo_text_processing/text_normalization/it/taggers/measure.py +++ b/nemo_text_processing/text_normalization/it/taggers/measure.py @@ -13,6 +13,9 @@ # limitations under the License. import pynini +from pynini.examples import plurals +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -23,8 +26,6 @@ insert_space, ) from nemo_text_processing.text_normalization.it.utils import get_abs_path -from pynini.examples import plurals -from pynini.lib import pynutil unit_singular = pynini.string_file(get_abs_path("data/measure/measurements.tsv")) suppletive = pynini.string_file(get_abs_path("data/measure/suppletive.tsv")) diff --git a/nemo_text_processing/text_normalization/it/taggers/money.py b/nemo_text_processing/text_normalization/it/taggers/money.py index 70647096e..e8f68c2ac 100644 --- a/nemo_text_processing/text_normalization/it/taggers/money.py +++ b/nemo_text_processing/text_normalization/it/taggers/money.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -24,7 +26,6 @@ ) from nemo_text_processing.text_normalization.es.graph_utils import decimal_separator from nemo_text_processing.text_normalization.it.utils import get_abs_path, load_labels -from pynini.lib import pynutil maj_singular_labels = load_labels(get_abs_path("data/money/currency_major.tsv")) maj_singular = pynini.string_file((get_abs_path("data/money/currency_major.tsv"))) diff --git a/nemo_text_processing/text_normalization/it/taggers/time.py b/nemo_text_processing/text_normalization/it/taggers/time.py index 841d3d472..351b6f40c 100644 --- a/nemo_text_processing/text_normalization/it/taggers/time.py +++ b/nemo_text_processing/text_normalization/it/taggers/time.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, insert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, insert_space + class TimeFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py index 18be3e5d7..ffb9d4e40 100644 --- a/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_WHITE_SPACE, GraphFst, @@ -32,7 +34,6 @@ from nemo_text_processing.text_normalization.it.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.it.taggers.word import WordFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git 
a/nemo_text_processing/text_normalization/it/taggers/whitelist.py b/nemo_text_processing/text_normalization/it/taggers/whitelist.py index daade2828..c2f5c17db 100644 --- a/nemo_text_processing/text_normalization/it/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/it/taggers/whitelist.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space from nemo_text_processing.text_normalization.it.utils import get_abs_path, load_labels -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/it/taggers/word.py b/nemo_text_processing/text_normalization/it/taggers/word.py index de3f4b7d3..b6746f79d 100644 --- a/nemo_text_processing/text_normalization/it/taggers/word.py +++ b/nemo_text_processing/text_normalization/it/taggers/word.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/it/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/it/verbalizers/cardinal.py index 30f7b4bcb..0e2b3d8b6 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/it/verbalizers/decimal.py b/nemo_text_processing/text_normalization/it/verbalizers/decimal.py index aa527c9f7..568361603 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/decimal.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, @@ -20,7 +22,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class DecimalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/it/verbalizers/electronic.py b/nemo_text_processing/text_normalization/it/verbalizers/electronic.py index 4925ee5ba..5a27efbb5 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/electronic.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -21,7 +23,6 @@ insert_space, ) from nemo_text_processing.text_normalization.it.utils import get_abs_path -from pynini.lib import pynutil digit_no_zero = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))) zero = pynini.invert(pynini.string_file(get_abs_path("data/numbers/zero.tsv"))) diff --git a/nemo_text_processing/text_normalization/it/verbalizers/measure.py b/nemo_text_processing/text_normalization/it/verbalizers/measure.py index 6e93f074c..93fa50500 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/measure.py @@ -13,13 +13,14 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, delete_extra_space, delete_preserve_order, ) -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/it/verbalizers/money.py b/nemo_text_processing/text_normalization/it/verbalizers/money.py index 960a1d927..ba9687bd5 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/money.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -22,7 +24,6 @@ ) from nemo_text_processing.text_normalization.es.graph_utils import shift_cardinal_gender, strip_cardinal_apocope from nemo_text_processing.text_normalization.it.utils import get_abs_path -from pynini.lib import pynutil fem = pynini.string_file((get_abs_path("data/money/currency_plural_fem.tsv"))) masc = pynini.string_file((get_abs_path("data/money/currency_plural_masc.tsv"))) diff --git a/nemo_text_processing/text_normalization/it/verbalizers/time.py b/nemo_text_processing/text_normalization/it/verbalizers/time.py index e08ec0712..6f098f152 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/time.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, delete_preserve_order from nemo_text_processing.text_normalization.it.utils import get_abs_path -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/it/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/it/verbalizers/verbalize_final.py index 4bd6090c4..d92c79848 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/verbalize_final.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( GraphFst, delete_extra_space, @@ -24,7 +26,6 @@ from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.text_normalization.it.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.utils.logging import logger -from pynini.lib import 
pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 092f619c0..ad284e871 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -29,6 +29,10 @@ import regex import tqdm from joblib import Parallel, delayed +from pynini.lib.rewrite import top_rewrite +from sacremoses import MosesDetokenizer +from tqdm import tqdm + from nemo_text_processing.text_normalization.data_loader_utils import ( load_file, post_process_punct, @@ -38,9 +42,6 @@ from nemo_text_processing.text_normalization.preprocessing_utils import additional_split from nemo_text_processing.text_normalization.token_parser import PRESERVE_ORDER_KEY, TokenParser from nemo_text_processing.utils.logging import logger -from pynini.lib.rewrite import top_rewrite -from sacremoses import MosesDetokenizer -from tqdm import tqdm # this is to handle long input sys.setrecursionlimit(3000) @@ -117,8 +118,8 @@ def __init__( self.post_processor = None if lang == "en": - from nemo_text_processing.text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst from nemo_text_processing.text_normalization.en.verbalizers.post_processing import PostProcessingFst + from nemo_text_processing.text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst if post_process: self.post_processor = PostProcessingFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache) diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index 55877afbb..749041eb1 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -20,12 +20,12 @@ import editdistance import pynini +from pynini.lib import rewrite + from nemo_text_processing.text_normalization.data_loader_utils 
import post_process_punct, pre_process from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.utils_audio_based import get_alignment from nemo_text_processing.utils.logging import logger -from pynini.lib import rewrite - """ The script provides multiple normalization options and chooses the best one that minimizes CER of the ASR output diff --git a/nemo_text_processing/text_normalization/ru/alphabet.py b/nemo_text_processing/text_normalization/ru/alphabet.py index 3df59f468..b18f494dd 100644 --- a/nemo_text_processing/text_normalization/ru/alphabet.py +++ b/nemo_text_processing/text_normalization/ru/alphabet.py @@ -17,6 +17,7 @@ # Russian minimally supervised number grammar. import pynini + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NON_BREAKING_SPACE, NEMO_SPACE from nemo_text_processing.text_normalization.ru.utils import get_abs_path diff --git a/nemo_text_processing/text_normalization/ru/taggers/cardinal.py b/nemo_text_processing/text_normalization/ru/taggers/cardinal.py index a36500560..d0bc8cc07 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/ru/taggers/cardinal.py @@ -17,6 +17,8 @@ # Russian minimally supervised number grammar. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, @@ -26,7 +28,6 @@ ) from nemo_text_processing.text_normalization.ru.alphabet import RU_ALPHA, TO_CYRILLIC from nemo_text_processing.text_normalization.ru.utils import get_abs_path -from pynini.lib import pynutil class CardinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/taggers/date.py b/nemo_text_processing/text_normalization/ru/taggers/date.py index d4ce6a646..dd3872e2f 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/date.py +++ b/nemo_text_processing/text_normalization/ru/taggers/date.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_NOT_QUOTE, @@ -23,7 +25,6 @@ insert_space, ) from nemo_text_processing.text_normalization.ru.utils import get_abs_path -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/taggers/decimals.py b/nemo_text_processing/text_normalization/ru/taggers/decimals.py index de29513a3..29c208777 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/decimals.py +++ b/nemo_text_processing/text_normalization/ru/taggers/decimals.py @@ -15,10 +15,11 @@ from collections import defaultdict import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_SPACE, GraphFst, insert_space from nemo_text_processing.text_normalization.en.utils import load_labels from nemo_text_processing.text_normalization.ru.utils import get_abs_path -from pynini.lib import pynutil delete_space = pynutil.delete(" ") diff --git a/nemo_text_processing/text_normalization/ru/taggers/electronic.py b/nemo_text_processing/text_normalization/ru/taggers/electronic.py index 22530a28b..f0a5e7bcd 100644 --- 
a/nemo_text_processing/text_normalization/ru/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/ru/taggers/electronic.py @@ -14,6 +14,8 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -24,7 +26,6 @@ ) from nemo_text_processing.text_normalization.ru.alphabet import RU_ALPHA, TO_CYRILLIC from nemo_text_processing.text_normalization.ru.utils import get_abs_path -from pynini.lib import pynutil class ElectronicFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/taggers/measure.py b/nemo_text_processing/text_normalization/ru/taggers/measure.py index a4c60f49c..86c92748f 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/measure.py +++ b/nemo_text_processing/text_normalization/ru/taggers/measure.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NON_BREAKING_SPACE, NEMO_NOT_QUOTE, @@ -22,7 +24,6 @@ ) from nemo_text_processing.text_normalization.ru.alphabet import RU_ALPHA from nemo_text_processing.text_normalization.ru.utils import get_abs_path -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/taggers/money.py b/nemo_text_processing/text_normalization/ru/taggers/money.py index c08723ae8..4adcd76fd 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/money.py +++ b/nemo_text_processing/text_normalization/ru/taggers/money.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, NEMO_SPACE, GraphFst from nemo_text_processing.text_normalization.ru.utils import get_abs_path -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/taggers/number_names.py b/nemo_text_processing/text_normalization/ru/taggers/number_names.py index 048b5f43e..4ee630448 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/number_names.py +++ b/nemo_text_processing/text_normalization/ru/taggers/number_names.py @@ -26,9 +26,10 @@ # acceptor (G). import pynini +from pynini.lib import pynutil, rewrite + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA from nemo_text_processing.text_normalization.ru.utils import get_abs_path, load_labels -from pynini.lib import pynutil, rewrite def get_number_names(): diff --git a/nemo_text_processing/text_normalization/ru/taggers/ordinal.py b/nemo_text_processing/text_normalization/ru/taggers/ordinal.py index 9a09e4063..09cd57d33 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/ru/taggers/ordinal.py @@ -17,9 +17,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst from nemo_text_processing.text_normalization.ru.utils import get_abs_path -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/taggers/telephone.py b/nemo_text_processing/text_normalization/ru/taggers/telephone.py index 142242e8a..d2b3d508c 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/ru/taggers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, delete_space, insert_space from nemo_text_processing.text_normalization.ru.alphabet import RU_ALPHA_OR_SPACE -from pynini.lib import pynutil class TelephoneFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/taggers/time.py b/nemo_text_processing/text_normalization/ru/taggers/time.py index bb97f36ea..4b3f40560 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/time.py +++ b/nemo_text_processing/text_normalization/ru/taggers/time.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from nemo_text_processing.text_normalization.ru.utils import get_abs_path -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ru/taggers/tokenize_and_classify.py index 3e9911b92..53f2b41f5 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/ru/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( GraphFst, delete_extra_space, @@ -35,7 +37,6 @@ from nemo_text_processing.text_normalization.ru.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.ru.taggers.word import WordFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/taggers/whitelist.py b/nemo_text_processing/text_normalization/ru/taggers/whitelist.py index 851829208..e4ca3a0c5 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/whitelist.py +++ 
b/nemo_text_processing/text_normalization/ru/taggers/whitelist.py @@ -13,10 +13,11 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, convert_space from nemo_text_processing.text_normalization.ru.alphabet import RU_ALPHA, TO_CYRILLIC from nemo_text_processing.text_normalization.ru.utils import get_abs_path, load_labels -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/taggers/word.py b/nemo_text_processing/text_normalization/ru/taggers/word.py index 16743d28b..7e0df2a95 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/word.py +++ b/nemo_text_processing/text_normalization/ru/taggers/word.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/ru/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/ru/verbalizers/cardinal.py index 4bff9ed3f..7be8d9727 100644 --- a/nemo_text_processing/text_normalization/ru/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/ru/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/ru/verbalizers/date.py b/nemo_text_processing/text_normalization/ru/verbalizers/date.py index 52a69b0c9..82ba1b35f 100644 --- a/nemo_text_processing/text_normalization/ru/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/ru/verbalizers/date.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from nemo_text_processing.text_normalization.ru.alphabet import RU_ALPHA -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/verbalizers/decimal.py b/nemo_text_processing/text_normalization/ru/verbalizers/decimal.py index c3006fe3d..a2900bd57 100644 --- a/nemo_text_processing/text_normalization/ru/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/ru/verbalizers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class DecimalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/ru/verbalizers/electronic.py b/nemo_text_processing/text_normalization/ru/verbalizers/electronic.py index 5bbfe008d..8440150c5 100644 --- a/nemo_text_processing/text_normalization/ru/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/ru/verbalizers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from nemo_text_processing.text_normalization.ru.alphabet import RU_ALPHA -from pynini.lib import pynutil class ElectronicFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/verbalizers/measure.py b/nemo_text_processing/text_normalization/ru/verbalizers/measure.py index 27c95b9ee..ad2e85bf5 100644 --- a/nemo_text_processing/text_normalization/ru/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/ru/verbalizers/measure.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NON_BREAKING_SPACE, NEMO_SPACE, @@ -20,7 +22,6 @@ delete_space, ) from nemo_text_processing.text_normalization.ru.alphabet import RU_ALPHA -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/verbalizers/money.py b/nemo_text_processing/text_normalization/ru/verbalizers/money.py index 3f5a5e936..02b903407 100644 --- a/nemo_text_processing/text_normalization/ru/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/ru/verbalizers/money.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from nemo_text_processing.text_normalization.ru.alphabet import RU_ALPHA -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/ru/verbalizers/ordinal.py index 00700d9ec..8f4288465 100644 --- a/nemo_text_processing/text_normalization/ru/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/ru/verbalizers/ordinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/ru/verbalizers/telephone.py b/nemo_text_processing/text_normalization/ru/verbalizers/telephone.py index bbd2d29f0..fac111ec8 100644 --- a/nemo_text_processing/text_normalization/ru/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/ru/verbalizers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from nemo_text_processing.text_normalization.ru.alphabet import RU_ALPHA -from pynini.lib import pynutil class TelephoneFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/ru/verbalizers/time.py b/nemo_text_processing/text_normalization/ru/verbalizers/time.py index 4f72879d2..9cbdb0e21 100644 --- a/nemo_text_processing/text_normalization/ru/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/ru/verbalizers/time.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space + class TimeFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/ru/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ru/verbalizers/verbalize_final.py index 8ac49c4ee..8d92e3efe 100644 --- a/nemo_text_processing/text_normalization/ru/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/ru/verbalizers/verbalize_final.py @@ -14,6 +14,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( GraphFst, delete_extra_space, @@ -23,7 +25,6 @@ from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.text_normalization.ru.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/run_evaluate.py b/nemo_text_processing/text_normalization/run_evaluate.py index 8204eccbf..f64771265 100644 --- a/nemo_text_processing/text_normalization/run_evaluate.py +++ b/nemo_text_processing/text_normalization/run_evaluate.py @@ -23,7 +23,6 @@ ) from nemo_text_processing.text_normalization.normalize import Normalizer - ''' Runs Evaluation on data in the format of : \t\t<`self` if trivial class or normalized text> like the Google text normalization data https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish diff --git a/nemo_text_processing/text_normalization/sv/graph_utils.py b/nemo_text_processing/text_normalization/sv/graph_utils.py index 001addc7c..b229ca2fa 100644 --- a/nemo_text_processing/text_normalization/sv/graph_utils.py 
+++ b/nemo_text_processing/text_normalization/sv/graph_utils.py @@ -14,9 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import delete_space, insert_space from pynini.lib import byte, pynutil +from nemo_text_processing.text_normalization.en.graph_utils import delete_space, insert_space + from .utils import get_abs_path, load_labels _ALPHA_UPPER = "ABCDEFGHIJKLMNOPQRSTUVWXYZÅÄÖÜÉ" diff --git a/nemo_text_processing/text_normalization/sv/taggers/abbreviation.py b/nemo_text_processing/text_normalization/sv/taggers/abbreviation.py index 5aac4cd13..a5a5b8ede 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/abbreviation.py +++ b/nemo_text_processing/text_normalization/sv/taggers/abbreviation.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, insert_space from nemo_text_processing.text_normalization.sv.graph_utils import SV_UPPER -from pynini.lib import pynutil class AbbreviationFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/taggers/cardinal.py b/nemo_text_processing/text_normalization/sv/taggers/cardinal.py index 7f06fcd5d..021e652bd 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/sv/taggers/cardinal.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, @@ -24,7 +26,6 @@ ) from nemo_text_processing.text_normalization.sv.graph_utils import SV_ALPHA from nemo_text_processing.text_normalization.sv.utils import get_abs_path -from pynini.lib import pynutil def make_million(number: str, non_zero_no_one: 'pynini.FstLike', deterministic: bool = True) -> 'pynini.FstLike': diff --git a/nemo_text_processing/text_normalization/sv/taggers/date.py b/nemo_text_processing/text_normalization/sv/taggers/date.py index ce2bc8f74..58e3d0d3e 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/date.py +++ b/nemo_text_processing/text_normalization/sv/taggers/date.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SPACE, @@ -22,7 +24,6 @@ ) from nemo_text_processing.text_normalization.sv.graph_utils import SV_ALPHA from nemo_text_processing.text_normalization.sv.utils import get_abs_path -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/taggers/decimal.py b/nemo_text_processing/text_normalization/sv/taggers/decimal.py index a63b939fb..2ac0cecc1 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/sv/taggers/decimal.py @@ -14,9 +14,10 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst from nemo_text_processing.text_normalization.sv.utils import get_abs_path -from pynini.lib import pynutil def get_quantity( diff --git a/nemo_text_processing/text_normalization/sv/taggers/electronic.py b/nemo_text_processing/text_normalization/sv/taggers/electronic.py index f0896edf8..75a24b205 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/sv/taggers/electronic.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space from nemo_text_processing.text_normalization.sv.utils import get_abs_path, load_labels -from pynini.lib import pynutil class ElectronicFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/taggers/fraction.py b/nemo_text_processing/text_normalization/sv/taggers/fraction.py index a53d723b8..69ab07447 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/sv/taggers/fraction.py @@ -13,10 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst from nemo_text_processing.text_normalization.sv.graph_utils import ensure_space from nemo_text_processing.text_normalization.sv.utils import get_abs_path -from pynini.lib import pynutil class FractionFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/taggers/measure.py b/nemo_text_processing/text_normalization/sv/taggers/measure.py index 83dcf9b0a..e114e9e6d 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/measure.py +++ b/nemo_text_processing/text_normalization/sv/taggers/measure.py @@ -14,6 +14,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NON_BREAKING_SPACE, NEMO_SIGMA, @@ -24,7 +26,6 @@ ) from nemo_text_processing.text_normalization.sv.graph_utils import SV_ALPHA, TO_LOWER from nemo_text_processing.text_normalization.sv.utils import get_abs_path -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/taggers/money.py b/nemo_text_processing/text_normalization/sv/taggers/money.py index 37bd8f9e4..16858cc5c 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/money.py +++ b/nemo_text_processing/text_normalization/sv/taggers/money.py @@ -14,6 +14,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, @@ -24,7 +26,6 @@ ) from nemo_text_processing.text_normalization.sv.graph_utils import SV_ALPHA, ensure_space from nemo_text_processing.text_normalization.sv.utils import get_abs_path, load_labels -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/taggers/ordinal.py b/nemo_text_processing/text_normalization/sv/taggers/ordinal.py index 59dbe528f..7cb62517f 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/sv/taggers/ordinal.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -25,7 +27,6 @@ ) from nemo_text_processing.text_normalization.sv.taggers.cardinal import filter_punctuation, make_million from nemo_text_processing.text_normalization.sv.utils import get_abs_path -from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/taggers/telephone.py b/nemo_text_processing/text_normalization/sv/taggers/telephone.py index 7d85072ef..4b37d28de 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/sv/taggers/telephone.py @@ -14,6 +14,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SPACE, @@ -25,7 +27,6 @@ from nemo_text_processing.text_normalization.sv.graph_utils import ensure_space from nemo_text_processing.text_normalization.sv.taggers.cardinal import CardinalFst from nemo_text_processing.text_normalization.sv.utils import get_abs_path -from pynini.lib import pynutil class TelephoneFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/taggers/time.py b/nemo_text_processing/text_normalization/sv/taggers/time.py index d89182bc4..676e78592 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/time.py +++ b/nemo_text_processing/text_normalization/sv/taggers/time.py @@ -15,6 +15,8 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, @@ -26,7 +28,6 @@ ) from nemo_text_processing.text_normalization.sv.graph_utils import ensure_space from nemo_text_processing.text_normalization.sv.utils import get_abs_path, load_labels -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify.py index eab5e449f..ff2495524 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify.py @@ -16,6 +16,8 @@ import time import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_WHITE_SPACE, GraphFst, @@ -38,7 +40,6 @@ from nemo_text_processing.text_normalization.sv.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.sv.taggers.word import WordFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff 
--git a/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify_with_audio.py b/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify_with_audio.py index e6488365b..4470a4a73 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify_with_audio.py +++ b/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify_with_audio.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, @@ -52,7 +54,6 @@ from nemo_text_processing.text_normalization.sv.verbalizers.telephone import TelephoneFst as vTelephoneFst from nemo_text_processing.text_normalization.sv.verbalizers.time import TimeFst as vTimeFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/taggers/whitelist.py b/nemo_text_processing/text_normalization/sv/taggers/whitelist.py index 1c3569b59..e8bf8151c 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/sv/taggers/whitelist.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space from nemo_text_processing.text_normalization.sv.utils import get_abs_path, load_labels -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/taggers/word.py b/nemo_text_processing/text_normalization/sv/taggers/word.py index 98e211c19..74ad8a3a3 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/word.py +++ b/nemo_text_processing/text_normalization/sv/taggers/word.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/sv/verbalizers/cardinal.py index 3dec2cfa1..9412c4600 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/date.py b/nemo_text_processing/text_normalization/sv/verbalizers/date.py index 7f45297e6..81ef6658a 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/date.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SPACE, @@ -19,7 +21,6 @@ delete_preserve_order, ) from nemo_text_processing.text_normalization.sv.utils import get_abs_path -from pynini.lib import pynutil era_words = pynini.string_file(get_abs_path("data/dates/era_words.tsv")) diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/decimals.py b/nemo_text_processing/text_normalization/sv/verbalizers/decimals.py index b07a51d57..404b42495 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/decimals.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/decimals.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, @@ -19,7 +21,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class DecimalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/electronic.py b/nemo_text_processing/text_normalization/sv/verbalizers/electronic.py index 562f0c884..0cbbeb793 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/electronic.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -21,7 +23,6 @@ ) from nemo_text_processing.text_normalization.sv.graph_utils import bos_or_space, eos_or_space from nemo_text_processing.text_normalization.sv.utils import get_abs_path -from pynini.lib import pynutil digit_no_zero = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))) zero = pynini.invert(pynini.string_file(get_abs_path("data/numbers/zero.tsv"))) diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/fraction.py b/nemo_text_processing/text_normalization/sv/verbalizers/fraction.py index 27357ecb1..003ce2e97 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/fraction.py @@ -14,9 +14,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, insert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, insert_space + class FractionFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/measure.py b/nemo_text_processing/text_normalization/sv/verbalizers/measure.py index 0af155d2b..447150a7a 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/measure.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space + class MeasureFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/money.py b/nemo_text_processing/text_normalization/sv/verbalizers/money.py index 30a33ceae..8610dc26b 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/money.py @@ -13,13 +13,14 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, delete_extra_space, delete_preserve_order, ) -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/sv/verbalizers/ordinal.py index 8a7217b15..ef46cf682 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/ordinal.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py b/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py index 2eb79a759..af17c6d48 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SPACE, @@ -20,7 +22,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class TelephoneFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/time.py b/nemo_text_processing/text_normalization/sv/verbalizers/time.py index ae4c7b828..c715b3545 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/time.py @@ -13,6 +13,8 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, @@ -22,7 +24,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/sv/verbalizers/verbalize_final.py index 162c54da3..52cf8de07 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/verbalize_final.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( GraphFst, delete_extra_space, @@ -24,7 +26,6 @@ from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.text_normalization.sv.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/utils_audio_based.py b/nemo_text_processing/text_normalization/utils_audio_based.py index 4c05f985d..2e9626d9e 100644 --- a/nemo_text_processing/text_normalization/utils_audio_based.py +++ b/nemo_text_processing/text_normalization/utils_audio_based.py @@ -15,6 +15,7 @@ from typing import Dict from cdifflib import CSequenceMatcher + from nemo_text_processing.utils.logging import logger MATCH = "match" diff --git a/nemo_text_processing/text_normalization/zh/graph_utils.py b/nemo_text_processing/text_normalization/zh/graph_utils.py index a33dc768f..20e7532b6 100644 --- a/nemo_text_processing/text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/text_normalization/zh/graph_utils.py @@ -18,11 +18,12 @@ from typing import Dict import pynini -from nemo_text_processing.utils.logging import logger from pynini import Far from pynini.export 
import export from pynini.lib import byte, pynutil, utf8 +from nemo_text_processing.utils.logging import logger + # ghaph_utils is here since importing from en folders will cause import errors # that the data file names have to be the same with what are in the en folder diff --git a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py index 3d750e8b9..3756ba6c8 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil class CardinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/zh/taggers/date.py b/nemo_text_processing/text_normalization/zh/taggers/date.py index 849ada5a6..92fbfce4d 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/date.py +++ b/nemo_text_processing/text_normalization/zh/taggers/date.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/zh/taggers/decimal.py b/nemo_text_processing/text_normalization/zh/taggers/decimal.py index 3e9b118bd..8228777c7 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/decimal.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil def 
get_quantity(decimal): diff --git a/nemo_text_processing/text_normalization/zh/taggers/fraction.py b/nemo_text_processing/text_normalization/zh/taggers/fraction.py index b0a302a8b..6d68280b5 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/zh/taggers/fraction.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst + class FractionFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/zh/taggers/math_symbol.py b/nemo_text_processing/text_normalization/zh/taggers/math_symbol.py index 239466954..d6ae0be9c 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/math_symbol.py +++ b/nemo_text_processing/text_normalization/zh/taggers/math_symbol.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.text_normalization.zh.taggers.cardinal import CardinalFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil class MathSymbol(GraphFst): diff --git a/nemo_text_processing/text_normalization/zh/taggers/measure.py b/nemo_text_processing/text_normalization/zh/taggers/measure.py index f4d3c8688..3fa61cffe 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/measure.py +++ b/nemo_text_processing/text_normalization/zh/taggers/measure.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst, insert_space from nemo_text_processing.text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil class Measure(GraphFst): diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index b637bb2a8..93fa59e61 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/zh/taggers/ordinal.py b/nemo_text_processing/text_normalization/zh/taggers/ordinal.py index 88774edca..258a9068c 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/ordinal.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py index 57d672dc6..df612fd8d 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py +++ b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_SIGMA, GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil class PreProcessor(GraphFst): diff --git a/nemo_text_processing/text_normalization/zh/taggers/punctuation.py b/nemo_text_processing/text_normalization/zh/taggers/punctuation.py index 0d24a09fb..cff124834 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/zh/taggers/punctuation.py @@ -16,11 +16,12 @@ from unicodedata import category import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, NEMO_SIGMA, GraphFst -from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels from pynini.examples import plurals from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, NEMO_SIGMA, GraphFst +from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels + class PunctuationFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/zh/taggers/time.py b/nemo_text_processing/text_normalization/zh/taggers/time.py index 87a01bc9b..283b8c47b 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/time.py +++ b/nemo_text_processing/text_normalization/zh/taggers/time.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py index 46bc0f2e7..822f3d00f 100644 --- 
a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.zh.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, @@ -37,7 +39,6 @@ from nemo_text_processing.text_normalization.zh.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.zh.taggers.word import Char from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/zh/taggers/whitelist.py b/nemo_text_processing/text_normalization/zh/taggers/whitelist.py index 3b3c745d6..5b6196102 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/zh/taggers/whitelist.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst, convert_space from nemo_text_processing.text_normalization.zh.utils import get_abs_path, load_labels -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/zh/taggers/word.py b/nemo_text_processing/text_normalization/zh/taggers/word.py index 24851c5b1..776e4afdc 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/word.py +++ b/nemo_text_processing/text_normalization/zh/taggers/word.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_SPACE, GraphFst + class Char(GraphFst): ''' diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/zh/verbalizers/cardinal.py index a63f0756b..0cd9c3193 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/cardinal.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/date.py b/nemo_text_processing/text_normalization/zh/verbalizers/date.py index b968c4b92..86405bcff 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/date.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class DateFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/text_normalization/zh/verbalizers/decimal.py index 4f1f8980f..05fb2045e 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/decimal.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class DecimalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/fraction.py b/nemo_text_processing/text_normalization/zh/verbalizers/fraction.py index 96aff9492..8207c1a22 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/fraction.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class FractionFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/math_symbol.py b/nemo_text_processing/text_normalization/zh/verbalizers/math_symbol.py index 924f9b5ab..59ef1c31a 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/math_symbol.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/math_symbol.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and 
# limitations under the License. import pynini -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst + class MathSymbol(GraphFst): ''' diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/measure.py b/nemo_text_processing/text_normalization/zh/verbalizers/measure.py index 19c190248..ff4d0df07 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/measure.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class Measure(GraphFst): ''' diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/money.py b/nemo_text_processing/text_normalization/zh/verbalizers/money.py index a749018fd..9e121bbc6 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/money.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class MoneyFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/zh/verbalizers/ordinal.py index dd9b303c9..0379c06fe 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/ordinal.py @@ -14,9 
+14,10 @@ import pynini -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py index 67b0b1954..36394843c 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini +from pynini.lib import pynutil, utf8 + from nemo_text_processing.text_normalization.zh.graph_utils import ( NEMO_ALPHA, NEMO_DIGIT, @@ -21,7 +23,6 @@ GraphFst, ) from nemo_text_processing.text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil, utf8 class PostProcessor(GraphFst): diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/time.py b/nemo_text_processing/text_normalization/zh/verbalizers/time.py index fd9c41d55..aa3baf046 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/time.py @@ -14,9 +14,10 @@ import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from nemo_text_processing.text_normalization.zh.utils import get_abs_path -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py index 19a398b1a..da4d64ca0 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py +++ 
b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import pynini + from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.text_normalization.zh.verbalizers.cardinal import CardinalFst from nemo_text_processing.text_normalization.zh.verbalizers.date import DateFst diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py index 706abb3cd..e4b0927d0 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py @@ -14,10 +14,11 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst, delete_space, generator_main from nemo_text_processing.text_normalization.zh.verbalizers.postprocessor import PostProcessor from nemo_text_processing.text_normalization.zh.verbalizers.verbalize import VerbalizeFst -from pynini.lib import pynutil # from nemo_text_processing.utils.logging import logger diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py index e9780744f..3be84e0a0 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst + class Whitelist(GraphFst): ''' diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/word.py b/nemo_text_processing/text_normalization/zh/verbalizers/word.py index 5f241f476..bdcafef96 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/word.py @@ -11,9 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst + class Char(GraphFst): ''' diff --git a/requirements/requirements_test.txt b/requirements/requirements_test.txt index d89ed803a..a5f160c41 100644 --- a/requirements/requirements_test.txt +++ b/requirements/requirements_test.txt @@ -1,6 +1,6 @@ black==19.10b0 click==8.0.2 -isort[requirements] < 5 +isort[requirements]>5.1.0,<6.0.0 parameterized pynini==2.1.5 pytest diff --git a/setup.cfg b/setup.cfg index 57d6480e8..5b0dd345a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,7 +32,7 @@ markers = [isort] known_localfolder = nemo,tests -sections = FUTURE,STDLIB,THIRDPARTY,LOCALFOLDER +sections = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER default_section = THIRDPARTY skip = setup.py diff --git a/tests/conftest.py b/tests/conftest.py index 77ee095de..b2216e874 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -218,7 +218,7 @@ def pytest_configure(config): extract_data_from_tar(test_dir, test_data_archive, local_data=config.option.use_local_test_data) # Set cache directory for TN/ITN tests 
- from .nemo_text_processing.utils import set_cache_dir, set_audio_based_tests + from .nemo_text_processing.utils import set_audio_based_tests, set_cache_dir set_cache_dir(config.option.tn_cache_dir) set_audio_based_tests(config.option.run_audio_based) diff --git a/tests/nemo_text_processing/ar/test_cardinal.py b/tests/nemo_text_processing/ar/test_cardinal.py index f63890166..85e9c7d37 100644 --- a/tests/nemo_text_processing/ar/test_cardinal.py +++ b/tests/nemo_text_processing/ar/test_cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from tests.nemo_text_processing.utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/ar/test_decimal.py b/tests/nemo_text_processing/ar/test_decimal.py index 50dcb3115..8c4753aee 100644 --- a/tests/nemo_text_processing/ar/test_decimal.py +++ b/tests/nemo_text_processing/ar/test_decimal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from tests.nemo_text_processing.utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/ar/test_fraction.py b/tests/nemo_text_processing/ar/test_fraction.py index 67f030be6..6ab278fc4 100644 --- a/tests/nemo_text_processing/ar/test_fraction.py +++ b/tests/nemo_text_processing/ar/test_fraction.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from tests.nemo_text_processing.utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/ar/test_money.py b/tests/nemo_text_processing/ar/test_money.py index 219414dc3..6fe36ba35 100644 --- a/tests/nemo_text_processing/ar/test_money.py +++ b/tests/nemo_text_processing/ar/test_money.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from tests.nemo_text_processing.utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/ar/test_whitelist.py b/tests/nemo_text_processing/ar/test_whitelist.py index d0291703d..29ad1dc6d 100644 --- a/tests/nemo_text_processing/ar/test_whitelist.py +++ 
b/tests/nemo_text_processing/ar/test_whitelist.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from tests.nemo_text_processing.utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/audio_based_utils/test_audio_based_utils.py b/tests/nemo_text_processing/audio_based_utils/test_audio_based_utils.py index c2c8dbc97..2f9b723ac 100644 --- a/tests/nemo_text_processing/audio_based_utils/test_audio_based_utils.py +++ b/tests/nemo_text_processing/audio_based_utils/test_audio_based_utils.py @@ -13,6 +13,7 @@ # limitations under the License. import pytest + from nemo_text_processing.text_normalization.utils_audio_based import get_alignment diff --git a/tests/nemo_text_processing/de/test_cardinal.py b/tests/nemo_text_processing/de/test_cardinal.py index 1119a72d3..d7fd9a33e 100644 --- a/tests/nemo_text_processing/de/test_cardinal.py +++ b/tests/nemo_text_processing/de/test_cardinal.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/de/test_date.py b/tests/nemo_text_processing/de/test_date.py index e4a0cb8cc..13a8785e7 100644 --- a/tests/nemo_text_processing/de/test_date.py +++ b/tests/nemo_text_processing/de/test_date.py @@ -13,10 +13,11 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/de/test_decimal.py b/tests/nemo_text_processing/de/test_decimal.py index 149b8ee75..17cdeadfe 100644 --- a/tests/nemo_text_processing/de/test_decimal.py +++ b/tests/nemo_text_processing/de/test_decimal.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/de/test_electronic.py b/tests/nemo_text_processing/de/test_electronic.py index 3400dcc2e..eaf442533 100644 --- a/tests/nemo_text_processing/de/test_electronic.py +++ b/tests/nemo_text_processing/de/test_electronic.py @@ -13,10 +13,11 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/de/test_fraction.py b/tests/nemo_text_processing/de/test_fraction.py index 2e068bc15..9f7b40e97 100644 --- a/tests/nemo_text_processing/de/test_fraction.py +++ b/tests/nemo_text_processing/de/test_fraction.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/de/test_measure.py b/tests/nemo_text_processing/de/test_measure.py index 
c6220664b..27adc13a9 100644 --- a/tests/nemo_text_processing/de/test_measure.py +++ b/tests/nemo_text_processing/de/test_measure.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/de/test_money.py b/tests/nemo_text_processing/de/test_money.py index 0989104ad..5124205cf 100644 --- a/tests/nemo_text_processing/de/test_money.py +++ b/tests/nemo_text_processing/de/test_money.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/de/test_normalization_with_audio.py b/tests/nemo_text_processing/de/test_normalization_with_audio.py index 2f04090c6..05493e8e3 100644 --- a/tests/nemo_text_processing/de/test_normalization_with_audio.py +++ b/tests/nemo_text_processing/de/test_normalization_with_audio.py @@ -15,9 +15,10 @@ import logging import pytest -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio + from ..utils import CACHE_DIR, get_test_cases_multiple diff --git a/tests/nemo_text_processing/de/test_ordinal.py 
b/tests/nemo_text_processing/de/test_ordinal.py index a703564e3..e608922e9 100644 --- a/tests/nemo_text_processing/de/test_ordinal.py +++ b/tests/nemo_text_processing/de/test_ordinal.py @@ -14,9 +14,10 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/de/test_telephone.py b/tests/nemo_text_processing/de/test_telephone.py index bb3d0ce7a..57595fac8 100644 --- a/tests/nemo_text_processing/de/test_telephone.py +++ b/tests/nemo_text_processing/de/test_telephone.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/de/test_time.py b/tests/nemo_text_processing/de/test_time.py index 709c2802c..33768b607 100644 --- a/tests/nemo_text_processing/de/test_time.py +++ b/tests/nemo_text_processing/de/test_time.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/de/test_whitelist.py b/tests/nemo_text_processing/de/test_whitelist.py index d740c966b..9db19cac4 100644 --- a/tests/nemo_text_processing/de/test_whitelist.py +++ b/tests/nemo_text_processing/de/test_whitelist.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/de/test_word.py b/tests/nemo_text_processing/de/test_word.py index 0026e0901..259b79bfb 100644 --- a/tests/nemo_text_processing/de/test_word.py +++ b/tests/nemo_text_processing/de/test_word.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_address.py b/tests/nemo_text_processing/en/test_address.py index d7d25a3e7..c7a3523a0 100644 --- 
a/tests/nemo_text_processing/en/test_address.py +++ b/tests/nemo_text_processing/en/test_address.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_cardinal.py b/tests/nemo_text_processing/en/test_cardinal.py index a7109414a..1ee3a2a5b 100644 --- a/tests/nemo_text_processing/en/test_cardinal.py +++ b/tests/nemo_text_processing/en/test_cardinal.py @@ -13,10 +13,11 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_date.py b/tests/nemo_text_processing/en/test_date.py index 390e56e9b..6237d88c5 100644 --- a/tests/nemo_text_processing/en/test_date.py +++ b/tests/nemo_text_processing/en/test_date.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_decimal.py b/tests/nemo_text_processing/en/test_decimal.py index b5e14647d..ff021f72a 100644 --- a/tests/nemo_text_processing/en/test_decimal.py +++ b/tests/nemo_text_processing/en/test_decimal.py @@ -13,10 +13,11 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_electronic.py b/tests/nemo_text_processing/en/test_electronic.py index 5f14e0e9e..e8640062c 100644 --- a/tests/nemo_text_processing/en/test_electronic.py +++ b/tests/nemo_text_processing/en/test_electronic.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_fraction.py b/tests/nemo_text_processing/en/test_fraction.py index ea7db2cf7..764205591 100644 --- a/tests/nemo_text_processing/en/test_fraction.py +++ b/tests/nemo_text_processing/en/test_fraction.py @@ -14,9 +14,10 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_math.py b/tests/nemo_text_processing/en/test_math.py index 830800145..e2ecdebb8 100644 --- a/tests/nemo_text_processing/en/test_math.py +++ b/tests/nemo_text_processing/en/test_math.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_measure.py b/tests/nemo_text_processing/en/test_measure.py index a17881c41..b03b3ff53 100644 --- a/tests/nemo_text_processing/en/test_measure.py +++ b/tests/nemo_text_processing/en/test_measure.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_money.py b/tests/nemo_text_processing/en/test_money.py index 81a8c99a7..9f1387c51 100644 --- a/tests/nemo_text_processing/en/test_money.py +++ b/tests/nemo_text_processing/en/test_money.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_normalization_with_audio.py b/tests/nemo_text_processing/en/test_normalization_with_audio.py index 2601d52a2..671da172a 100644 --- a/tests/nemo_text_processing/en/test_normalization_with_audio.py +++ 
b/tests/nemo_text_processing/en/test_normalization_with_audio.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio + from ..utils import CACHE_DIR, get_test_cases_multiple diff --git a/tests/nemo_text_processing/en/test_ordinal.py b/tests/nemo_text_processing/en/test_ordinal.py index 3dc06a19a..6f87a832d 100644 --- a/tests/nemo_text_processing/en/test_ordinal.py +++ b/tests/nemo_text_processing/en/test_ordinal.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_punctuation.py b/tests/nemo_text_processing/en/test_punctuation.py index 941db694d..75ff2e73c 100644 --- a/tests/nemo_text_processing/en/test_punctuation.py +++ b/tests/nemo_text_processing/en/test_punctuation.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_range.py b/tests/nemo_text_processing/en/test_range.py index 3bdb7969f..ac93613be 100644 --- a/tests/nemo_text_processing/en/test_range.py +++ b/tests/nemo_text_processing/en/test_range.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_roman.py b/tests/nemo_text_processing/en/test_roman.py index 2f7149ff9..dc9468fb3 100644 --- a/tests/nemo_text_processing/en/test_roman.py +++ b/tests/nemo_text_processing/en/test_roman.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_serial.py b/tests/nemo_text_processing/en/test_serial.py index 4ff0ed982..aab870abf 100644 --- a/tests/nemo_text_processing/en/test_serial.py +++ b/tests/nemo_text_processing/en/test_serial.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_special_text.py b/tests/nemo_text_processing/en/test_special_text.py index f0ba6bb6f..a461fe703 100644 --- a/tests/nemo_text_processing/en/test_special_text.py +++ b/tests/nemo_text_processing/en/test_special_text.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_telephone.py b/tests/nemo_text_processing/en/test_telephone.py index 765660dd3..253abd4d3 100644 --- a/tests/nemo_text_processing/en/test_telephone.py +++ b/tests/nemo_text_processing/en/test_telephone.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_text_split.py b/tests/nemo_text_processing/en/test_text_split.py index b13a49119..3bbb9aa49 100644 --- a/tests/nemo_text_processing/en/test_text_split.py +++ b/tests/nemo_text_processing/en/test_text_split.py @@ -13,6 +13,7 @@ # limitations under the License. import pytest + from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR diff --git a/tests/nemo_text_processing/en/test_time.py b/tests/nemo_text_processing/en/test_time.py index 4405a65ce..6729b39e9 100644 --- a/tests/nemo_text_processing/en/test_time.py +++ b/tests/nemo_text_processing/en/test_time.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_whitelist.py b/tests/nemo_text_processing/en/test_whitelist.py index 0cb72082e..ce955807a 100644 --- a/tests/nemo_text_processing/en/test_whitelist.py +++ b/tests/nemo_text_processing/en/test_whitelist.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/en/test_word.py b/tests/nemo_text_processing/en/test_word.py index 188cc818e..41587addc 100644 --- a/tests/nemo_text_processing/en/test_word.py +++ b/tests/nemo_text_processing/en/test_word.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/es/test_cardinal.py b/tests/nemo_text_processing/es/test_cardinal.py index 748853db5..cb53c517a 100644 --- 
a/tests/nemo_text_processing/es/test_cardinal.py +++ b/tests/nemo_text_processing/es/test_cardinal.py @@ -13,10 +13,11 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/es/test_date.py b/tests/nemo_text_processing/es/test_date.py index 150fc23ed..af1c3d96f 100644 --- a/tests/nemo_text_processing/es/test_date.py +++ b/tests/nemo_text_processing/es/test_date.py @@ -13,10 +13,11 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/es/test_decimal.py b/tests/nemo_text_processing/es/test_decimal.py index 4e9585011..39edf7066 100644 --- a/tests/nemo_text_processing/es/test_decimal.py +++ b/tests/nemo_text_processing/es/test_decimal.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/es/test_electronic.py b/tests/nemo_text_processing/es/test_electronic.py index 7726e0d9f..d476b79b0 100644 --- a/tests/nemo_text_processing/es/test_electronic.py +++ b/tests/nemo_text_processing/es/test_electronic.py @@ -13,10 +13,11 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/es/test_fraction.py b/tests/nemo_text_processing/es/test_fraction.py index c0022c377..a189f4689 100644 --- a/tests/nemo_text_processing/es/test_fraction.py +++ b/tests/nemo_text_processing/es/test_fraction.py @@ -14,9 +14,10 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/es/test_measure.py b/tests/nemo_text_processing/es/test_measure.py index 4474ea16e..2e644c8db 100644 --- a/tests/nemo_text_processing/es/test_measure.py +++ 
b/tests/nemo_text_processing/es/test_measure.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/es/test_money.py b/tests/nemo_text_processing/es/test_money.py index 25f34d810..a0ebb8313 100644 --- a/tests/nemo_text_processing/es/test_money.py +++ b/tests/nemo_text_processing/es/test_money.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/es/test_normalization_with_audio.py b/tests/nemo_text_processing/es/test_normalization_with_audio.py index f16f36084..665738733 100644 --- a/tests/nemo_text_processing/es/test_normalization_with_audio.py +++ b/tests/nemo_text_processing/es/test_normalization_with_audio.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio + from ..utils import CACHE_DIR, get_test_cases_multiple diff --git a/tests/nemo_text_processing/es/test_ordinal.py b/tests/nemo_text_processing/es/test_ordinal.py index 5d98c3512..41741f2de 100644 --- a/tests/nemo_text_processing/es/test_ordinal.py +++ b/tests/nemo_text_processing/es/test_ordinal.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/es/test_telephone.py b/tests/nemo_text_processing/es/test_telephone.py index 5a0159158..489eb0930 100644 --- a/tests/nemo_text_processing/es/test_telephone.py +++ b/tests/nemo_text_processing/es/test_telephone.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/es/test_time.py b/tests/nemo_text_processing/es/test_time.py index 86e1c9893..a55fcba7f 100644 --- a/tests/nemo_text_processing/es/test_time.py +++ b/tests/nemo_text_processing/es/test_time.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/es/test_whitelist.py b/tests/nemo_text_processing/es/test_whitelist.py index f5a58d858..52f27da86 100644 --- a/tests/nemo_text_processing/es/test_whitelist.py +++ b/tests/nemo_text_processing/es/test_whitelist.py @@ -13,10 +13,11 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/es/test_word.py b/tests/nemo_text_processing/es/test_word.py index 87ab24070..f23e83c05 100644 --- a/tests/nemo_text_processing/es/test_word.py +++ b/tests/nemo_text_processing/es/test_word.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/es_en/test_cardinal.py b/tests/nemo_text_processing/es_en/test_cardinal.py index 02c623fb6..d83c009d5 100644 --- a/tests/nemo_text_processing/es_en/test_cardinal.py +++ b/tests/nemo_text_processing/es_en/test_cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/es_en/test_date.py b/tests/nemo_text_processing/es_en/test_date.py index 0cac5dabd..0136b54d7 100644 --- a/tests/nemo_text_processing/es_en/test_date.py +++ b/tests/nemo_text_processing/es_en/test_date.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/es_en/test_decimal.py b/tests/nemo_text_processing/es_en/test_decimal.py index 01c9fcf4c..d2fbae8c6 100644 --- a/tests/nemo_text_processing/es_en/test_decimal.py +++ b/tests/nemo_text_processing/es_en/test_decimal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/es_en/test_electronic.py b/tests/nemo_text_processing/es_en/test_electronic.py index d3c4a921b..5dc91c639 100644 --- a/tests/nemo_text_processing/es_en/test_electronic.py +++ b/tests/nemo_text_processing/es_en/test_electronic.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/es_en/test_fraction.py b/tests/nemo_text_processing/es_en/test_fraction.py index c4866bf7a..125d9fa3d 100644 --- a/tests/nemo_text_processing/es_en/test_fraction.py +++ b/tests/nemo_text_processing/es_en/test_fraction.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/es_en/test_measure.py b/tests/nemo_text_processing/es_en/test_measure.py index 56defb8a8..948f54db6 100644 --- a/tests/nemo_text_processing/es_en/test_measure.py +++ b/tests/nemo_text_processing/es_en/test_measure.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from 
nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/es_en/test_money.py b/tests/nemo_text_processing/es_en/test_money.py index 3d3e75656..6b2496015 100644 --- a/tests/nemo_text_processing/es_en/test_money.py +++ b/tests/nemo_text_processing/es_en/test_money.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/es_en/test_ordinal.py b/tests/nemo_text_processing/es_en/test_ordinal.py index 0b4a9cad7..bef676a3c 100644 --- a/tests/nemo_text_processing/es_en/test_ordinal.py +++ b/tests/nemo_text_processing/es_en/test_ordinal.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/es_en/test_telephone.py b/tests/nemo_text_processing/es_en/test_telephone.py index 4b86eeb94..ec8dba594 100644 --- a/tests/nemo_text_processing/es_en/test_telephone.py +++ b/tests/nemo_text_processing/es_en/test_telephone.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/es_en/test_time.py b/tests/nemo_text_processing/es_en/test_time.py index 
1fbba3d90..1d76e5012 100644 --- a/tests/nemo_text_processing/es_en/test_time.py +++ b/tests/nemo_text_processing/es_en/test_time.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/es_en/test_whitelist.py b/tests/nemo_text_processing/es_en/test_whitelist.py index e42b9b179..13924220b 100644 --- a/tests/nemo_text_processing/es_en/test_whitelist.py +++ b/tests/nemo_text_processing/es_en/test_whitelist.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/es_en/test_word.py b/tests/nemo_text_processing/es_en/test_word.py index 3cd465165..273089b90 100644 --- a/tests/nemo_text_processing/es_en/test_word.py +++ b/tests/nemo_text_processing/es_en/test_word.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/fr/test_cardinal.py b/tests/nemo_text_processing/fr/test_cardinal.py index 0899ccef4..4e3353cef 100644 --- a/tests/nemo_text_processing/fr/test_cardinal.py +++ b/tests/nemo_text_processing/fr/test_cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/fr/test_date.py b/tests/nemo_text_processing/fr/test_date.py index 754870153..614ed0e24 100644 --- a/tests/nemo_text_processing/fr/test_date.py +++ b/tests/nemo_text_processing/fr/test_date.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/fr/test_decimal.py b/tests/nemo_text_processing/fr/test_decimal.py index d5891074c..54e8f53c1 100644 --- a/tests/nemo_text_processing/fr/test_decimal.py +++ b/tests/nemo_text_processing/fr/test_decimal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/fr/test_electronic.py b/tests/nemo_text_processing/fr/test_electronic.py index 2fb86e157..475af6ffd 100644 --- a/tests/nemo_text_processing/fr/test_electronic.py +++ b/tests/nemo_text_processing/fr/test_electronic.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/fr/test_fraction.py b/tests/nemo_text_processing/fr/test_fraction.py index a669d58ca..5b5431b79 100644 --- a/tests/nemo_text_processing/fr/test_fraction.py +++ b/tests/nemo_text_processing/fr/test_fraction.py @@ -14,9 +14,10 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/fr/test_measure.py b/tests/nemo_text_processing/fr/test_measure.py index f48fb663e..884ccc957 100644 --- a/tests/nemo_text_processing/fr/test_measure.py +++ b/tests/nemo_text_processing/fr/test_measure.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from 
nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/fr/test_money.py b/tests/nemo_text_processing/fr/test_money.py index 818d7b4f7..bba19432e 100644 --- a/tests/nemo_text_processing/fr/test_money.py +++ b/tests/nemo_text_processing/fr/test_money.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/fr/test_ordinal.py b/tests/nemo_text_processing/fr/test_ordinal.py index 584df3440..a5915dcb5 100644 --- a/tests/nemo_text_processing/fr/test_ordinal.py +++ b/tests/nemo_text_processing/fr/test_ordinal.py @@ -14,9 +14,10 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/fr/test_telephone.py b/tests/nemo_text_processing/fr/test_telephone.py index 8b062f3c8..dd24dccfe 100644 --- a/tests/nemo_text_processing/fr/test_telephone.py +++ b/tests/nemo_text_processing/fr/test_telephone.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/fr/test_time.py b/tests/nemo_text_processing/fr/test_time.py index 6dec74c52..f5eee9db2 100644 --- 
a/tests/nemo_text_processing/fr/test_time.py +++ b/tests/nemo_text_processing/fr/test_time.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/fr/test_whitelist.py b/tests/nemo_text_processing/fr/test_whitelist.py index db9212278..dac398fba 100644 --- a/tests/nemo_text_processing/fr/test_whitelist.py +++ b/tests/nemo_text_processing/fr/test_whitelist.py @@ -14,9 +14,10 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/fr/test_word.py b/tests/nemo_text_processing/fr/test_word.py index 11b5b5791..6d48db4fc 100644 --- a/tests/nemo_text_processing/fr/test_word.py +++ b/tests/nemo_text_processing/fr/test_word.py @@ -14,9 +14,10 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/hu/test_cardinal.py b/tests/nemo_text_processing/hu/test_cardinal.py index ca6ed1a29..2276dbfd4 100644 --- a/tests/nemo_text_processing/hu/test_cardinal.py +++ b/tests/nemo_text_processing/hu/test_cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/hu/test_date.py b/tests/nemo_text_processing/hu/test_date.py index be84c5f6b..6ca267e35 100644 --- a/tests/nemo_text_processing/hu/test_date.py +++ b/tests/nemo_text_processing/hu/test_date.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/hu/test_decimal.py b/tests/nemo_text_processing/hu/test_decimal.py index 6cf69a65f..7bbc0b06f 100644 --- a/tests/nemo_text_processing/hu/test_decimal.py +++ b/tests/nemo_text_processing/hu/test_decimal.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/hu/test_electronic.py b/tests/nemo_text_processing/hu/test_electronic.py index 5d3e47f73..951e2cc4c 100644 --- a/tests/nemo_text_processing/hu/test_electronic.py +++ b/tests/nemo_text_processing/hu/test_electronic.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/hu/test_fraction.py b/tests/nemo_text_processing/hu/test_fraction.py index 0789a0318..6b1477439 100644 --- a/tests/nemo_text_processing/hu/test_fraction.py +++ b/tests/nemo_text_processing/hu/test_fraction.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/hu/test_measure.py b/tests/nemo_text_processing/hu/test_measure.py index 93160a382..fac51c162 100644 --- a/tests/nemo_text_processing/hu/test_measure.py +++ b/tests/nemo_text_processing/hu/test_measure.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/hu/test_money.py b/tests/nemo_text_processing/hu/test_money.py index e06e407c8..cfd8d0b54 100644 --- a/tests/nemo_text_processing/hu/test_money.py +++ b/tests/nemo_text_processing/hu/test_money.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/hu/test_ordinal.py b/tests/nemo_text_processing/hu/test_ordinal.py index b083a59fc..a5fdc593d 100644 --- 
a/tests/nemo_text_processing/hu/test_ordinal.py +++ b/tests/nemo_text_processing/hu/test_ordinal.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/hu/test_telephone.py b/tests/nemo_text_processing/hu/test_telephone.py index 8c9486557..d674af8b5 100644 --- a/tests/nemo_text_processing/hu/test_telephone.py +++ b/tests/nemo_text_processing/hu/test_telephone.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/hu/test_time.py b/tests/nemo_text_processing/hu/test_time.py index c173c02e1..1c6b56d4b 100644 --- a/tests/nemo_text_processing/hu/test_time.py +++ b/tests/nemo_text_processing/hu/test_time.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/hu/test_whitelist.py b/tests/nemo_text_processing/hu/test_whitelist.py index 1fa2d2ac6..0cab00cc1 100644 --- a/tests/nemo_text_processing/hu/test_whitelist.py +++ b/tests/nemo_text_processing/hu/test_whitelist.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/hu/test_word.py b/tests/nemo_text_processing/hu/test_word.py index d22e8bf11..d11092aaa 100644 --- a/tests/nemo_text_processing/hu/test_word.py +++ b/tests/nemo_text_processing/hu/test_word.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/it/test_cardinal.py b/tests/nemo_text_processing/it/test_cardinal.py index 7837eb3b1..87b94ee36 100644 --- a/tests/nemo_text_processing/it/test_cardinal.py +++ b/tests/nemo_text_processing/it/test_cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/it/test_decimal.py b/tests/nemo_text_processing/it/test_decimal.py index 822ef4827..c37dbcc99 100644 --- a/tests/nemo_text_processing/it/test_decimal.py +++ b/tests/nemo_text_processing/it/test_decimal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/it/test_electronic.py b/tests/nemo_text_processing/it/test_electronic.py index d8e116d4c..bf9390883 100644 --- a/tests/nemo_text_processing/it/test_electronic.py +++ b/tests/nemo_text_processing/it/test_electronic.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/it/test_measure.py b/tests/nemo_text_processing/it/test_measure.py index 85f9bc0d3..4287e5b7d 100644 --- a/tests/nemo_text_processing/it/test_measure.py +++ b/tests/nemo_text_processing/it/test_measure.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/it/test_money.py b/tests/nemo_text_processing/it/test_money.py index a65cd2528..e98a1e942 100644 --- a/tests/nemo_text_processing/it/test_money.py +++ b/tests/nemo_text_processing/it/test_money.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/it/test_time.py b/tests/nemo_text_processing/it/test_time.py index 840d2a9b0..8bb0eb1ab 100644 --- a/tests/nemo_text_processing/it/test_time.py +++ b/tests/nemo_text_processing/it/test_time.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/it/test_whitelist.py b/tests/nemo_text_processing/it/test_whitelist.py index d687da7cb..a380062ad 100644 --- a/tests/nemo_text_processing/it/test_whitelist.py +++ b/tests/nemo_text_processing/it/test_whitelist.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/pt/test_cardinal.py b/tests/nemo_text_processing/pt/test_cardinal.py index bfe7d82d0..dafa3e358 100644 --- a/tests/nemo_text_processing/pt/test_cardinal.py +++ b/tests/nemo_text_processing/pt/test_cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/pt/test_date.py b/tests/nemo_text_processing/pt/test_date.py index 88b5a50eb..88ea91a28 100644 --- a/tests/nemo_text_processing/pt/test_date.py +++ b/tests/nemo_text_processing/pt/test_date.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/pt/test_decimal.py b/tests/nemo_text_processing/pt/test_decimal.py index 4fd77295e..afbec329b 100644 --- a/tests/nemo_text_processing/pt/test_decimal.py +++ b/tests/nemo_text_processing/pt/test_decimal.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/pt/test_electronic.py b/tests/nemo_text_processing/pt/test_electronic.py index 9e340471f..bff47d1fe 100644 --- a/tests/nemo_text_processing/pt/test_electronic.py +++ b/tests/nemo_text_processing/pt/test_electronic.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/pt/test_measure.py b/tests/nemo_text_processing/pt/test_measure.py index 892b45962..9dcfc8548 100644 --- a/tests/nemo_text_processing/pt/test_measure.py +++ b/tests/nemo_text_processing/pt/test_measure.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/pt/test_money.py b/tests/nemo_text_processing/pt/test_money.py index 40c682fe9..632bdb458 100644 --- a/tests/nemo_text_processing/pt/test_money.py +++ b/tests/nemo_text_processing/pt/test_money.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/pt/test_ordinal.py b/tests/nemo_text_processing/pt/test_ordinal.py index 19acfbaee..a830e2d21 100644 --- a/tests/nemo_text_processing/pt/test_ordinal.py +++ b/tests/nemo_text_processing/pt/test_ordinal.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff 
--git a/tests/nemo_text_processing/pt/test_telephone.py b/tests/nemo_text_processing/pt/test_telephone.py index 6d36e9db2..e27c47e1c 100644 --- a/tests/nemo_text_processing/pt/test_telephone.py +++ b/tests/nemo_text_processing/pt/test_telephone.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/pt/test_time.py b/tests/nemo_text_processing/pt/test_time.py index 7a556b36b..e43c61ac6 100644 --- a/tests/nemo_text_processing/pt/test_time.py +++ b/tests/nemo_text_processing/pt/test_time.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/pt/test_whitelist.py b/tests/nemo_text_processing/pt/test_whitelist.py index 0f8884b53..399f191da 100644 --- a/tests/nemo_text_processing/pt/test_whitelist.py +++ b/tests/nemo_text_processing/pt/test_whitelist.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/pt/test_word.py b/tests/nemo_text_processing/pt/test_word.py index 2ad54b15e..cd3cc5d88 100644 --- a/tests/nemo_text_processing/pt/test_word.py +++ b/tests/nemo_text_processing/pt/test_word.py @@ -14,9 +14,10 @@ import pytest 
-from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/sv/test_cardinal.py b/tests/nemo_text_processing/sv/test_cardinal.py index 99ca5b48b..7f0914ec2 100644 --- a/tests/nemo_text_processing/sv/test_cardinal.py +++ b/tests/nemo_text_processing/sv/test_cardinal.py @@ -13,10 +13,11 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/sv/test_date.py b/tests/nemo_text_processing/sv/test_date.py index 495150df0..b0a53e46c 100644 --- a/tests/nemo_text_processing/sv/test_date.py +++ b/tests/nemo_text_processing/sv/test_date.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/sv/test_decimal.py b/tests/nemo_text_processing/sv/test_decimal.py index e1d6a6b49..034e3d4d8 100644 --- a/tests/nemo_text_processing/sv/test_decimal.py +++ b/tests/nemo_text_processing/sv/test_decimal.py @@ -13,10 +13,11 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/sv/test_electronic.py b/tests/nemo_text_processing/sv/test_electronic.py index e9492a063..b2a828847 100644 --- a/tests/nemo_text_processing/sv/test_electronic.py +++ b/tests/nemo_text_processing/sv/test_electronic.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/sv/test_fraction.py b/tests/nemo_text_processing/sv/test_fraction.py index f807a4179..67253f272 100644 --- a/tests/nemo_text_processing/sv/test_fraction.py +++ b/tests/nemo_text_processing/sv/test_fraction.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/sv/test_measure.py b/tests/nemo_text_processing/sv/test_measure.py index 185d407d2..a4b026bbd 100644 --- a/tests/nemo_text_processing/sv/test_measure.py +++ b/tests/nemo_text_processing/sv/test_measure.py @@ -14,9 +14,10 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/sv/test_money.py b/tests/nemo_text_processing/sv/test_money.py index e9e875b99..b5b065b48 100644 --- a/tests/nemo_text_processing/sv/test_money.py +++ b/tests/nemo_text_processing/sv/test_money.py @@ -14,9 +14,10 @@ 
import pytest +from parameterized import parameterized + from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/sv/test_normalization_with_audio.py b/tests/nemo_text_processing/sv/test_normalization_with_audio.py index 105a60c60..bd9e88cbc 100644 --- a/tests/nemo_text_processing/sv/test_normalization_with_audio.py +++ b/tests/nemo_text_processing/sv/test_normalization_with_audio.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio + from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, get_test_cases_multiple diff --git a/tests/nemo_text_processing/sv/test_ordinal.py b/tests/nemo_text_processing/sv/test_ordinal.py index ffc6922f0..6cd11a4b0 100644 --- a/tests/nemo_text_processing/sv/test_ordinal.py +++ b/tests/nemo_text_processing/sv/test_ordinal.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/sv/test_telephone.py b/tests/nemo_text_processing/sv/test_telephone.py index 3e806ebd9..40e5c846f 100644 --- a/tests/nemo_text_processing/sv/test_telephone.py +++ b/tests/nemo_text_processing/sv/test_telephone.py @@ -14,10 +14,11 @@ 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/sv/test_time.py b/tests/nemo_text_processing/sv/test_time.py index a281b18e3..feaa637f2 100644 --- a/tests/nemo_text_processing/sv/test_time.py +++ b/tests/nemo_text_processing/sv/test_time.py @@ -13,10 +13,11 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/sv/test_whitelist.py b/tests/nemo_text_processing/sv/test_whitelist.py index 176744a87..5fba8e99a 100644 --- a/tests/nemo_text_processing/sv/test_whitelist.py +++ b/tests/nemo_text_processing/sv/test_whitelist.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/sv/test_word.py b/tests/nemo_text_processing/sv/test_word.py index 636e7fe3a..c92f148f5 100644 --- a/tests/nemo_text_processing/sv/test_word.py +++ b/tests/nemo_text_processing/sv/test_word.py @@ -13,10 +13,11 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git a/tests/nemo_text_processing/zh/test_cardinal.py b/tests/nemo_text_processing/zh/test_cardinal.py index d09b71f3d..a8274d398 100644 --- a/tests/nemo_text_processing/zh/test_cardinal.py +++ b/tests/nemo_text_processing/zh/test_cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/zh/test_date.py b/tests/nemo_text_processing/zh/test_date.py index 01d3e038b..1621ce5e5 100644 --- a/tests/nemo_text_processing/zh/test_date.py +++ b/tests/nemo_text_processing/zh/test_date.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/zh/test_decimal.py b/tests/nemo_text_processing/zh/test_decimal.py index db6046b60..1846efa54 100644 --- a/tests/nemo_text_processing/zh/test_decimal.py +++ b/tests/nemo_text_processing/zh/test_decimal.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/zh/test_fraction.py b/tests/nemo_text_processing/zh/test_fraction.py index 264d64d13..9b71f1d06 100644 --- a/tests/nemo_text_processing/zh/test_fraction.py +++ b/tests/nemo_text_processing/zh/test_fraction.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/zh/test_math.py b/tests/nemo_text_processing/zh/test_math.py index e8740aa2d..cf44a5c22 100644 --- a/tests/nemo_text_processing/zh/test_math.py +++ b/tests/nemo_text_processing/zh/test_math.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/zh/test_measure.py b/tests/nemo_text_processing/zh/test_measure.py index 32df28855..fd8d76aa3 100644 --- a/tests/nemo_text_processing/zh/test_measure.py +++ b/tests/nemo_text_processing/zh/test_measure.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/zh/test_money.py b/tests/nemo_text_processing/zh/test_money.py index 3d50ce5fa..8aa2fc320 100644 --- a/tests/nemo_text_processing/zh/test_money.py +++ b/tests/nemo_text_processing/zh/test_money.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/zh/test_ordinal.py b/tests/nemo_text_processing/zh/test_ordinal.py index f8644c31d..409e62964 100644 --- a/tests/nemo_text_processing/zh/test_ordinal.py +++ b/tests/nemo_text_processing/zh/test_ordinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/zh/test_preprocess.py b/tests/nemo_text_processing/zh/test_preprocess.py index f817517b5..34838cc90 100644 --- a/tests/nemo_text_processing/zh/test_preprocess.py +++ b/tests/nemo_text_processing/zh/test_preprocess.py @@ -13,9 +13,10 @@ # limitations under the License. import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized +from nemo_text_processing.text_normalization.normalize import Normalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/zh/test_time.py b/tests/nemo_text_processing/zh/test_time.py index 9a8e93f26..ed285983b 100644 --- a/tests/nemo_text_processing/zh/test_time.py +++ b/tests/nemo_text_processing/zh/test_time.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/zh/test_whitelist.py b/tests/nemo_text_processing/zh/test_whitelist.py index 8b3e871b1..8e6087f53 100644 --- a/tests/nemo_text_processing/zh/test_whitelist.py +++ b/tests/nemo_text_processing/zh/test_whitelist.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/zh/test_word.py b/tests/nemo_text_processing/zh/test_word.py index 5e2e1da45..ddf587857 100644 --- a/tests/nemo_text_processing/zh/test_word.py +++ b/tests/nemo_text_processing/zh/test_word.py @@ -14,9 +14,10 @@ import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 6e87742ef..58eb6a706 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -19,6 +19,7 @@ from argparse import ArgumentParser import pynini + from nemo_text_processing.text_normalization.en.graph_utils import generator_main # This script exports compiled grammars inside nemo_text_processing into OpenFst finite state archive files From 
d9f749ea4b0eba7bc7ce851eaad2f4e0a1160de0 Mon Sep 17 00:00:00 2001 From: David Sargsyan <66821320+davidks13@users.noreply.github.com> Date: Thu, 15 Feb 2024 22:44:37 +0400 Subject: [PATCH 08/90] Armenian itn (#136) * Added Armenian ITN Signed-off-by: David Sargsyan * Added Armenian ITN Signed-off-by: David Sargsyan * Added Armenian ITN Signed-off-by: David Sargsyan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: David Sargsyan * Added context for tests and fixed CodeQL errors Signed-off-by: David Sargsyan * Revert "Added context for tests and fixed CodeQL errors" This reverts commit 2c804d941963c0be21d3aad07e6cd13568ab747b. Signed-off-by: David Sargsyan * Added context to some test files and fixed CodeQL errors Signed-off-by: David Sargsyan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: David Sargsyan * deleted unnecessary data Signed-off-by: David Sargsyan * translated a few measurements to Armenian Signed-off-by: David Sargsyan * adjusted some things for better readability and maintainer support Signed-off-by: David Sargsyan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixed one test case and some issues Signed-off-by: David Sargsyan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: David Sargsyan Co-authored-by: David Sargsyan Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui --- .../inverse_text_normalization/hy/__init__.py | 13 ++ .../hy/data/__init__.py | 13 ++ .../hy/data/currency.tsv | 53 ++++++++ .../hy/data/measurement_dates.tsv | 4 + .../hy/data/measurements.tsv | 48 +++++++ .../hy/data/numbers/digit.tsv | 11 ++ .../hy/data/numbers/digits_no_one.tsv | 10 ++ .../hy/data/numbers/ties.tsv | 10 ++ 
.../hy/data/ordinals/digit.tsv | 10 ++ .../hy/data/time/hours.tsv | 29 +++++ .../hy/data/time/minute_to.tsv | 59 +++++++++ .../hy/data/time/minutes.tsv | 65 ++++++++++ .../hy/data/time/minutes_to.tsv | 59 +++++++++ .../hy/data/time/to_hour.tsv | 25 ++++ .../hy/data/whitelist.tsv | 7 ++ .../hy/taggers/__init__.py | 13 ++ .../hy/taggers/cardinal.py | 101 +++++++++++++++ .../hy/taggers/decimal.py | 117 ++++++++++++++++++ .../hy/taggers/fraction.py | 44 +++++++ .../hy/taggers/measure.py | 112 +++++++++++++++++ .../hy/taggers/money.py | 64 ++++++++++ .../hy/taggers/ordinal.py | 55 ++++++++ .../hy/taggers/punctuation.py | 36 ++++++ .../hy/taggers/time.py | 76 ++++++++++++ .../hy/taggers/tokenize_and_classify.py | 114 +++++++++++++++++ .../hy/taggers/whitelist.py | 36 ++++++ .../hy/taggers/word.py | 31 +++++ .../inverse_text_normalization/hy/utils.py | 60 +++++++++ .../hy/verbalizers/__init__.py | 13 ++ .../hy/verbalizers/cardinal.py | 42 +++++++ .../hy/verbalizers/decimal.py | 58 +++++++++ .../hy/verbalizers/fraction.py | 43 +++++++ .../hy/verbalizers/measure.py | 63 ++++++++++ .../hy/verbalizers/money.py | 48 +++++++ .../hy/verbalizers/ordinal.py | 43 +++++++ .../hy/verbalizers/time.py | 53 ++++++++ .../hy/verbalizers/verbalize.py | 51 ++++++++ .../hy/verbalizers/verbalize_final.py | 45 +++++++ .../hy/verbalizers/whitelist.py | 38 ++++++ .../hy/verbalizers/word.py | 34 +++++ .../inverse_normalize.py | 7 +- .../run_evaluate.py | 2 +- tests/nemo_text_processing/hy/__init__.py | 13 ++ .../test_cases_cardinal.txt | 23 ++++ .../test_cases_decimal.txt | 14 +++ .../test_cases_fraction.txt | 24 ++++ .../test_cases_measure.txt | 12 ++ .../test_cases_money.txt | 10 ++ .../test_cases_ordinal.txt | 11 ++ .../test_cases_time.txt | 9 ++ .../test_cases_whitelist.txt | 7 ++ .../test_cases_word.txt | 50 ++++++++ .../nemo_text_processing/hy/test_cardinal.py | 32 +++++ tests/nemo_text_processing/hy/test_decimal.py | 31 +++++ .../nemo_text_processing/hy/test_fraction.py | 31 +++++ 
tests/nemo_text_processing/hy/test_measure.py | 31 +++++ tests/nemo_text_processing/hy/test_money.py | 31 +++++ tests/nemo_text_processing/hy/test_ordinal.py | 31 +++++ ..._sparrowhawk_inverse_text_normalization.sh | 69 +++++++++++ tests/nemo_text_processing/hy/test_time.py | 31 +++++ .../nemo_text_processing/hy/test_whitelist.py | 31 +++++ tests/nemo_text_processing/hy/test_word.py | 31 +++++ .../pynini_export.py | 9 +- 63 files changed, 2343 insertions(+), 3 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/hy/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/currency.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/measurement_dates.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/measurements.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/numbers/digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/numbers/digits_no_one.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/numbers/ties.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/ordinals/digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/time/hours.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/time/minute_to.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/time/minutes.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/time/minutes_to.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/time/to_hour.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/whitelist.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hy/taggers/__init__.py create mode 100644 
nemo_text_processing/inverse_text_normalization/hy/taggers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/taggers/decimal.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/taggers/fraction.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/taggers/measure.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/taggers/money.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/taggers/ordinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/taggers/punctuation.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/taggers/time.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/taggers/tokenize_and_classify.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/taggers/whitelist.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/taggers/word.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/verbalizers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/verbalizers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/verbalizers/decimal.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/verbalizers/fraction.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/verbalizers/measure.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/verbalizers/money.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/verbalizers/time.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/verbalizers/verbalize.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/verbalizers/verbalize_final.py create 
mode 100644 nemo_text_processing/inverse_text_normalization/hy/verbalizers/whitelist.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/verbalizers/word.py create mode 100644 tests/nemo_text_processing/hy/__init__.py create mode 100644 tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_decimal.txt create mode 100644 tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_fraction.txt create mode 100644 tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_measure.txt create mode 100644 tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_money.txt create mode 100644 tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_ordinal.txt create mode 100644 tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_time.txt create mode 100644 tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_whitelist.txt create mode 100644 tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_word.txt create mode 100644 tests/nemo_text_processing/hy/test_cardinal.py create mode 100644 tests/nemo_text_processing/hy/test_decimal.py create mode 100644 tests/nemo_text_processing/hy/test_fraction.py create mode 100644 tests/nemo_text_processing/hy/test_measure.py create mode 100644 tests/nemo_text_processing/hy/test_money.py create mode 100644 tests/nemo_text_processing/hy/test_ordinal.py create mode 100755 tests/nemo_text_processing/hy/test_sparrowhawk_inverse_text_normalization.sh create mode 100644 tests/nemo_text_processing/hy/test_time.py create mode 100644 tests/nemo_text_processing/hy/test_whitelist.py create mode 100644 tests/nemo_text_processing/hy/test_word.py diff --git a/nemo_text_processing/inverse_text_normalization/hy/__init__.py b/nemo_text_processing/inverse_text_normalization/hy/__init__.py new file 
mode 100644 index 000000000..9df65818d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/__init__.py b/nemo_text_processing/inverse_text_normalization/hy/data/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/currency.tsv b/nemo_text_processing/inverse_text_normalization/hy/data/currency.tsv new file mode 100644 index 000000000..6caf930d0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/currency.tsv @@ -0,0 +1,53 @@ +aed դիրհամ +ARS արգենտինական պեսո +֏ դրամ +֏ հայկական դրամ +a$ ավստրալական դոլար +br բելառուսական ռուբլի +₿ բիթքոին +£ ֆունտ +£ բրիտանական ֆունտ +ca$ կանադական դոլար +元 չինական յեն +元 յեն +kr դանիական կրոն +$ դոլար +€ եվրո +₾ վրացական լարի +₾ լարի +₹ հնդկական ռուփի +₹ ռուփի +﷼ պարսկական ռիալ +﷼ ռիալ +₪ իսրայելական շեկել +₪ շեկել +¥ ճապոնական յեն +¥ յեն +₸ ղազախական տենգե +₸ տենգե +som ղրղզական սոմ +som սոմ +ل.ل լիբանանյան ֆունտ +ل.ل լիբանանյան լիրա +$ մեքսիկական պեսո +nz$ նորզելանդական դոլլր +kr նորվեգական կրոն +zł լեհական զլոտի +zł զլոտի +£ ֆունտ ստերլինգ +£ ֆունտ +₽ ռուսական ռուբլի +₽ ռուբլի +rsd սերբական դինար +s$ սինգապուրի դոլար +₩ կորեական վոն +kr շվեդական կրոն +chf շվեյցարական ֆրանկ +£s սիրիական ֆունտ +₺ թուրքական լիրա +₴ ուկրաինական գրիվնա +$ ամերիկյան դոլար +$ ամն դոլար +som ուզբեկական սոմ +₩ վոն +¥ յեն \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/measurement_dates.tsv b/nemo_text_processing/inverse_text_normalization/hy/data/measurement_dates.tsv new file mode 100644 index 000000000..da54d562e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/measurement_dates.tsv @@ -0,0 +1,4 @@ +թ. թվական +թթ. թվականներ +դ. դար +դդ. 
դարեր \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/measurements.tsv b/nemo_text_processing/inverse_text_normalization/hy/data/measurements.tsv new file mode 100644 index 000000000..a2b355292 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/measurements.tsv @@ -0,0 +1,48 @@ +°F ֆարենհայթ +°C ցելսիուս +K կելվին +կմ կիլոմետր +կմ/ժ կիլոմետր ժամ +մ մետր +մ/ժ մետր ժամ +սմ սանտիմետր +մմ միլիմետր +հա հեկտար +մղն մղոն +մ² քառակուսի մետր +մ² մետր քառակուսի +կմ² քառակուսի կիլոմետր +կմ² կիլոմետր քառակուսի +% տոկոս +% տոկոսադրույք +Հց հերց +կՎտ կիլովատտ +կՎտ կիլո վատտ +կՎ/Ժ կիլովատտ ժամ +կՎ/ժ կիլո վատտ ժամ +Վտ/ժ վատտ ժամ +Վտ վատտ +ձ.ու. ձիաուժ +մգ միլիգրամ +կգ կիլոգրամ +Վ վոլտ +ժ ժամ +վ վայրկյան +ր րոպե +մ³ խորանարդ մետր +գ գրամ +տ տոննա +մբ մեգաբայթ +կբ կիլոբայթ +գբ գիգաբայթ +գբ գեգաբայթ +տբ տերաբայթ +տբ տեռաբայթ +կՎ կիլո վոլտ +մՎ մեգա վոլտ +Ա ամպեր +մԱ միլի ամպեր +մվ միլի վայրկյան +դմ դեցիմետր +սմ² քառակուսի սանտիմետր +սմ² քառակուսի սանտիմետր \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/numbers/digit.tsv b/nemo_text_processing/inverse_text_normalization/hy/data/numbers/digit.tsv new file mode 100644 index 000000000..4a38e1c3b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/numbers/digit.tsv @@ -0,0 +1,11 @@ +մեկ 1 +երկու 2 +երկուս 2 +երեք 3 +չորս 4 +հինգ 5 +վեց 6 +յոթ 7 +ութ 8 +ինը 9 +ինն 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/numbers/digits_no_one.tsv b/nemo_text_processing/inverse_text_normalization/hy/data/numbers/digits_no_one.tsv new file mode 100644 index 000000000..71c2e6fa0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/numbers/digits_no_one.tsv @@ -0,0 +1,10 @@ +երկու 2 +երկուս 2 +երեք 3 +չորս 4 +հինգ 5 +վեց 6 +յոթ 7 +ութ 8 +ինը 9 +ինն 9 \ No newline at end of file diff --git 
a/nemo_text_processing/inverse_text_normalization/hy/data/numbers/ties.tsv b/nemo_text_processing/inverse_text_normalization/hy/data/numbers/ties.tsv new file mode 100644 index 000000000..514ac8135 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/numbers/ties.tsv @@ -0,0 +1,10 @@ +տասը 1 +տասն 1 +քսան 2 +երեսուն 3 +քառասուն 4 +հիսուն 5 +վաթսուն 6 +յոթանասուն 7 +ութսուն 8 +իննսուն 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/ordinals/digit.tsv b/nemo_text_processing/inverse_text_normalization/hy/data/ordinals/digit.tsv new file mode 100644 index 000000000..947449eb4 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/ordinals/digit.tsv @@ -0,0 +1,10 @@ +զրոերորդ զրո +առաջին մեկ +երկրորդ երկու +երրորդ երեք +չորրորդ չորս +հինգերորդ հինգ +վեցերորդ վեց +յոթերորդ յոթ +ութերորդ ութ +իններորդ ինը \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/time/hours.tsv b/nemo_text_processing/inverse_text_normalization/hy/data/time/hours.tsv new file mode 100644 index 000000000..5cb60cada --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/time/hours.tsv @@ -0,0 +1,29 @@ +զրո 0 +մեկ 1 +երկու 2 +երկուս 2 +երեք 3 +չորս 4 +հինգ 5 +վեց 6 +յոթ 7 +ութ 8 +ինը 9 +տաս 10 +տասն 10 +տասնմեկ 11 +տասներկու 12 +տասներկուս 12 +տասներեք 13 +տասնչորս 14 +տասնհինգ 15 +տասնվեց 16 +տասնյոթ 17 +տասնութ 18 +տասնինը 19 +քսան 20 +քսանմեկ 21 +քսաներկու 22 +քսաներկուս 22 +քսաներեք 23 +քսանչորս 24 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/time/minute_to.tsv b/nemo_text_processing/inverse_text_normalization/hy/data/time/minute_to.tsv new file mode 100644 index 000000000..edab4d5b0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/time/minute_to.tsv @@ -0,0 +1,59 @@ +1 59 +2 58 +3 57 +4 56 +5 55 +6 54 +7 53 +8 52 +9 51 +10 50 +11 49 +12 48 +13 47 +14 46 +15 
45 +16 44 +17 43 +18 42 +19 41 +20 40 +21 39 +22 38 +23 37 +24 36 +25 35 +26 34 +27 33 +28 32 +29 31 +30 30 +31 29 +32 28 +33 27 +34 26 +35 25 +36 24 +37 23 +38 22 +39 21 +40 20 +41 19 +42 18 +43 17 +44 16 +45 15 +46 14 +47 13 +48 12 +49 11 +50 10 +51 9 +52 8 +53 7 +54 6 +55 5 +56 4 +57 3 +58 2 +59 1 diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/time/minutes.tsv b/nemo_text_processing/inverse_text_normalization/hy/data/time/minutes.tsv new file mode 100644 index 000000000..0f31d43bc --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/time/minutes.tsv @@ -0,0 +1,65 @@ +մեկ 01 +երկու 02 +երկուս 02 +երեք 03 +չորս 04 +հինգ 05 +վեց 06 +յոթ 07 +ութ 08 +ինը 09 +տասը 10 +տասնմեկ 11 +տասներկու 12 +տասներկուս 12 +տասներեք 13 +տասնչորս 14 +տասնհինգ 15 +տասնվեց 16 +տասնյոթ 17 +տասնութ 18 +տասնինը 19 +քսան 20 +քսանմեկ 21 +քսաներկու 22 +քսաներկուս 22 +քսաներեք 23 +քսանչորս 24 +քսանհինգ 25 +քսանվեց 26 +քսանյոթ 27 +քսանութ 28 +քսանինը 29 +երեսուն 30 +երեսունմեկ 31 +երեսուներկու 32 +երեսուներկուս 32 +երեսուներեք 33 +երեսունչորս 34 +երեսունհինգ 35 +երեսունվեց 36 +երեսունյոթ 37 +երեսունութ 38 +երեսունինը 39 +քառասուն 40 +քառասունմեկ 41 +քառասուներկու 42 +քառասուներկուս 42 +քառասուներեք 43 +քառասունչորս 44 +քառասունհինգ 45 +քառասունվեց 46 +քառասունյոթ 47 +քառասունութ 48 +քառասունինը 49 +հիսուն 50 +հիսունմեկ 51 +հիսուներկու 52 +հիսուներկուս 52 +հիսուներեք 53 +հիսունչորս 54 +հիսունհինգ 55 +հիսունվեց 56 +հիսունյոթ 57 +հիսունութ 58 +հիսունինը 59 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/time/minutes_to.tsv b/nemo_text_processing/inverse_text_normalization/hy/data/time/minutes_to.tsv new file mode 100644 index 000000000..0837e850a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/time/minutes_to.tsv @@ -0,0 +1,59 @@ +01 59 +02 58 +03 57 +04 56 +05 55 +06 54 +07 53 +08 52 +09 51 +10 50 +11 49 +12 48 +13 47 +14 46 +15 45 +16 44 +17 43 +18 42 +19 41 +20 40 +21 39 +22 
38 +23 37 +24 36 +25 35 +26 34 +27 33 +28 32 +29 31 +30 30 +31 29 +32 28 +33 27 +34 26 +35 25 +36 24 +37 23 +38 22 +39 21 +40 20 +41 19 +42 18 +43 17 +44 16 +45 15 +46 14 +47 13 +48 12 +49 11 +50 10 +51 09 +52 08 +53 07 +54 06 +55 05 +56 04 +57 03 +58 02 +59 01 diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/time/to_hour.tsv b/nemo_text_processing/inverse_text_normalization/hy/data/time/to_hour.tsv new file mode 100644 index 000000000..a56219579 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/time/to_hour.tsv @@ -0,0 +1,25 @@ +1 0 +2 1 +3 2 +4 3 +5 4 +6 5 +7 6 +8 7 +9 8 +10 9 +11 10 +12 11 +13 12 +14 13 +15 14 +16 15 +17 16 +18 17 +19 18 +20 19 +21 20 +22 21 +23 22 +24 23 +0 23 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/hy/data/whitelist.tsv new file mode 100644 index 000000000..0e400c28b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/whitelist.tsv @@ -0,0 +1,7 @@ +ս.թ. սույն թվականի +մ.թ.ա. մեր թվարկությունից առաջ +մ.թ. մեր թվարկություն +Ք.ա. քրիստոսից առաջ +Ք.հ. քրիստոսից հետո +Ք.ծ.ա. քրիստոսի ծնունդից առաջ +Ք.ծ.հ. քրիստոսից ծնունդից հետո diff --git a/nemo_text_processing/inverse_text_normalization/hy/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/hy/taggers/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/taggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/hy/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/hy/taggers/cardinal.py new file mode 100644 index 000000000..1a74ddc3e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/taggers/cardinal.py @@ -0,0 +1,101 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hy.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, delete_space + + +class CardinalFst(GraphFst): + """ + Finite state transducer for classifying cardinals + e.g. 
իննսունյոթ -> cardinal { integer: "97" } } + """ + + def __init__(self): + super().__init__(name="cardinal", kind="classify") + + zero = pynini.string_map([("զրո", "0")]) + digit = (pynini.string_file(get_abs_path("data/numbers/digit.tsv"))) + ( + pynini.closure(pynutil.delete("ն") | pynutil.delete("ի") | pynutil.delete("ին"), 0, 1) + ) + digits_no_one = pynini.string_file(get_abs_path("data/numbers/digits_no_one.tsv")) + graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")) + ( + pynini.closure(pynutil.delete("ն") | pynutil.delete("ի") | pynutil.delete("ին"), 0, 1) + ) + graph_digit = digit | pynutil.insert("0") + + graph_ties = graph_ties | pynutil.insert("0") + graph_two_digit_nums = graph_ties + graph_digit + + hundred = pynini.accep("հարյուր") + graph_hundred = pynini.cross("հարյուր", "1") + + graph_hundreds_first_digit = graph_hundred | (digits_no_one + delete_space + pynutil.delete(hundred)) + graph_hundreds = ( + (graph_hundreds_first_digit + delete_space | pynutil.insert("0", weight=0.1)) + + delete_space + + graph_two_digit_nums + ) + + self.graph_hundred_component_at_least_one_none_zero_digit = ( + graph_hundreds @ (pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)).optimize() + ) + + graph_one_thousand = pynini.cross("հազար", "1") + graph_many_thousand = graph_hundreds + delete_space + pynutil.delete("հազար") + graph_thousands = ( + (graph_one_thousand | graph_many_thousand | pynutil.insert("000", weight=0.000000001)) + + delete_space + + graph_hundreds + ) + + millions = pynini.accep("միլիոն") + graph_millions = ( + ((graph_hundreds + delete_space + pynutil.delete(millions)) | pynutil.insert("000", weight=0.1)) + + delete_space + + graph_thousands + ) + + billions = pynini.accep("միլիարդ") + graph_billions = ( + (graph_hundreds + delete_space + pynutil.delete(billions) + delete_space) + | pynutil.insert("000", weight=0.1) + ) + graph_millions + + trillions = pynini.accep("տրիլիոն") + graph_trillions = ( + 
(graph_hundreds + delete_space + pynutil.delete(trillions) + delete_space) + | pynutil.insert("000", weight=0.1) + ) + graph_billions + + graph = graph_trillions | zero + + delete_leading_zeroes = pynutil.delete(pynini.closure("0")) + stop_at_non_zero = pynini.difference(NEMO_DIGIT, "0") + rest_of_cardinal = pynini.closure(NEMO_DIGIT) + + clean_cardinal = delete_leading_zeroes + stop_at_non_zero + rest_of_cardinal + clean_cardinal = clean_cardinal | "0" + + graph = graph @ clean_cardinal + self.graph_no_exception = graph.optimize() + + final_graph = pynutil.insert("integer: \"") + graph + pynutil.insert("\"") + final_graph = self.add_tokens(final_graph) + + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hy/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/hy/taggers/decimal.py new file mode 100644 index 000000000..be52779f5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/taggers/decimal.py @@ -0,0 +1,117 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hy.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_LOWER_CASED, + MIN_NEG_WEIGHT, + NEMO_DIGIT, + NEMO_SIGMA, + TO_LOWER, + GraphFst, + delete_extra_space, + delete_space, +) + + +def get_quantity( + decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstLike', input_case: str = INPUT_LOWER_CASED +) -> 'pynini.FstLike': + """ + Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, + e.g. հինգ միլիոն -> tokens { decimal { integer_part: "5" quantity: "միլիոն" } } + e.g. հինգ ամբողջ յոթ միլիարդ -> tokens { decimal { integer_part: "5" fractional_part: "7" quantity: "միլիարդ" } } + + Args: + decimal: decimal FST + cardinal_up_to_hundred: cardinal FST + input_case: accepting either "lower_cased" or "cased" input. + (input_case is not necessary everything is made for lower_cased input) + TODO add case input support + + """ + numbers = cardinal_up_to_hundred @ ( + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) + ) + + suffix = pynini.union("միլիոն", "միլիարդ", "տրիլիոն") + + res = ( + pynutil.insert("integer_part: \"") + + numbers + + pynutil.insert("\"") + + delete_extra_space + + pynutil.insert("quantity: \"") + + suffix + + pynutil.insert("\"") + ) + res |= decimal + delete_extra_space + pynutil.insert("quantity: \"") + (suffix | "հազար") + pynutil.insert("\"") + return res + + +class DecimalFst(GraphFst): + """ + Finite state transducer for classifying decimal + e.g. հիսուն ու կես տրիլիոն -> decimal { integer_part: "50" fractional_part: "5" quantity: "տրիլիոն" } + e.g. մեկ միլիարդ -> decimal { integer_part: "1" quantity: "միլիարդ" } + Args: + cardinal: CardinalFst + input_case: accepting either "lower_cased" or "cased" input. 
+ TODO add cased input support + """ + + def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): + super().__init__(name="decimal", kind="classify") + + cardinal_graph = cardinal.graph_no_exception + + graph_decimal = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) | pynini.string_map( + [("զրո", "0"), ("կես", "5")] + ) + + graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal + self.only_decimal = graph_decimal.optimize() + + point_first = pynutil.delete("ամբողջ") + point_second = pynutil.delete("ու") + + graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"") + graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + final_graph_wo_sign = ( + pynini.closure((graph_integer | pynini.string_map(["", "0"])) + delete_extra_space, 0, 1) + + (point_first | point_second) + + delete_extra_space + + graph_fractional + ) + final_graph = final_graph_wo_sign + + self.final_graph_wo_negative = final_graph_wo_sign | get_quantity( + final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit, input_case=input_case + ) + + self.final_graph_wo_negative |= pynutil.add_weight( + pynini.compose(TO_LOWER + NEMO_SIGMA, self.final_graph_wo_negative).optimize(), MIN_NEG_WEIGHT + ) + + quantity_graph = get_quantity( + final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit, input_case=input_case + ) + final_graph |= quantity_graph + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hy/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/hy/taggers/fraction.py new file mode 100644 index 000000000..a7eba809f --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/taggers/fraction.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import INPUT_LOWER_CASED, GraphFst, delete_space + + +class FractionFst(GraphFst): + """ + Finite state transducer for classifying fraction + e.g. երկու երրորդ -> tokens { fraction { numerator: "2" denominator: "3" } } + + Args: + input_case: accepting either "lower_cased" or "cased" input. + (input_case is not necessary everything is made for lower_cased input) + TODO add cased input support + """ + + def __init__(self, cardinal: GraphFst, ordinal: GraphFst, input_case: str = INPUT_LOWER_CASED): + super().__init__(name="fraction", kind="classify") + cardinal_graph = cardinal.graph_no_exception + quarter = pynini.string_map([("քառորդ", "4")]) + ordinal_graph = ordinal.graph | quarter + + numerator = pynutil.insert("numerator: \"") + cardinal_graph + pynutil.insert("\"") + denominator = pynutil.insert(" denominator: \"") + ordinal_graph + pynutil.insert("\"") + + final_graph = numerator + delete_space + denominator + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hy/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/hy/taggers/measure.py new file mode 100644 index 000000000..a0814c582 --- /dev/null +++ 
b/nemo_text_processing/inverse_text_normalization/hy/taggers/measure.py @@ -0,0 +1,112 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hy.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_LOWER_CASED, + NEMO_SIGMA, + TO_LOWER, + GraphFst, + convert_space, + delete_extra_space, +) + + +class MeasureFst(GraphFst): + """ + Finite state transducer for classifying measure + e.g. իննսունյոթ հերց -> measure { cardinal { integer: "97" } units: "Հց" } + + Args: + cardinal: CardinalFst + decimal: DecimalFst + input_case: accepting either "lower_cased" or "cased" input. 
+ (input_case is not necessary everything is made for lower_cased input) + TODO add cased input support + """ + + def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPUT_LOWER_CASED): + super().__init__(name="measure", kind="classify") + + cardinal_graph = cardinal.graph_no_exception + from_to = pynini.string_map([("ից", "")]) + cardinal_graph += pynutil.insert("") | from_to + + casing_graph = pynini.closure(TO_LOWER | NEMO_SIGMA).optimize() + + graph_measurements_unit = pynini.string_file(get_abs_path("data/measurements.tsv")) + ( + pynutil.insert("") | pynutil.insert("ում") | pynutil.insert("ից") + ) + graph_measurements_unit = pynini.invert(graph_measurements_unit) + graph_measurements_unit = pynini.compose(casing_graph, graph_measurements_unit).optimize() + + measurements_unit = convert_space(graph_measurements_unit) + + graph_measurements_dates_unit = pynini.string_file(get_abs_path("data/measurement_dates.tsv")) + + graph_measurements_dates_unit = pynini.invert(graph_measurements_dates_unit) + graph_measurements_dates_unit = pynini.compose(casing_graph, graph_measurements_dates_unit).optimize() + + measurements_dates_unit = convert_space(graph_measurements_dates_unit) + + measurements_unit = pynutil.insert("units: \"") + measurements_unit + pynutil.insert("\"") + + measurements_dates_unit = pynutil.insert("units: \"") + measurements_dates_unit + pynutil.insert("\"") + + subgraph_decimal = ( + pynutil.insert("decimal { ") + + decimal.final_graph_wo_negative + + pynutil.insert(" }") + + delete_extra_space + + measurements_unit + ) + subgraph_cardinal = ( + pynutil.insert("cardinal { ") + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" }") + + delete_extra_space + + measurements_unit + ) + subgraph_cardinal_dates = ( + (measurements_dates_unit + delete_extra_space | pynutil.insert("")) + + pynutil.insert("cardinal { ") + + pynutil.insert("integer: \"") + + cardinal_graph + + 
pynutil.insert("\"") + + pynutil.insert(" }") + + delete_extra_space + + measurements_dates_unit + ) + subgraph_cardinal_dates |= ( + (measurements_dates_unit + delete_extra_space | pynutil.insert("")) + + pynutil.insert("cardinal { ") + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert('-') + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" }") + + delete_extra_space + + measurements_dates_unit + ) + + final_graph = subgraph_decimal | subgraph_cardinal | subgraph_cardinal_dates + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hy/taggers/money.py b/nemo_text_processing/inverse_text_normalization/hy/taggers/money.py new file mode 100644 index 000000000..97b4d464c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/taggers/money.py @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hy.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_LOWER_CASED, + NEMO_SIGMA, + GraphFst, + convert_space, + delete_extra_space, +) + + +class MoneyFst(GraphFst): + """ + Finite state transducer for classifying money + e.g. 
քսան հազար դրամ -> tokens { money { integer_part: "20000" currency: "֏" } } + + Args: + cardinal: CardinalFst + decimal: DecimalFst + input_case: accepting either "lower_cased" or "cased" input. + (input_case is not necessary everything is made for lower_cased input) + TODO add cased input support + """ + + def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPUT_LOWER_CASED): + super().__init__(name="money", kind="classify") + # quantity, integer_part, fractional_part, currency + + cardinal_graph = cardinal.graph_no_exception + graph_decimal_final = decimal.final_graph_wo_negative + unit = pynini.string_file(get_abs_path("data/currency.tsv")) + unit_singular = pynini.invert(unit) + + graph_unit_singular = pynutil.insert("currency: \"") + convert_space(unit_singular) + pynutil.insert("\"") + + graph_integer = ( + pynutil.insert("integer_part: \"") + + (NEMO_SIGMA @ cardinal_graph) + + pynutil.insert("\"") + + delete_extra_space + + graph_unit_singular + ) + graph_decimal = graph_decimal_final + delete_extra_space + graph_unit_singular + final_graph = graph_integer | graph_decimal + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hy/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/hy/taggers/ordinal.py new file mode 100644 index 000000000..381480270 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/taggers/ordinal.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hy.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + NEMO_CHAR, + GraphFst, + capitalized_input_graph, +) + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for classifying ordinal + e.g. հիսունյոթերորդ -> tokens { ordinal { integer: "57" } } + + Args: + cardinal: CardinalFst + input_case: accepting either "lower_cased" or "cased" input. + (input_case is not necessary everything is made for lower_cased input) + TODO add cased input support + """ + + def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): + super().__init__(name="ordinal", kind="classify") + + cardinal_graph = cardinal.graph_no_exception + graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")) + graph = pynini.closure(NEMO_CHAR) + pynini.union(graph_digit, pynini.cross("երորդ", "")) + + self.graph = pynini.compose(graph, cardinal_graph).optimize() + + if input_case == INPUT_CASED: + self.graph = capitalized_input_graph(self.graph) + + final_graph = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hy/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/hy/taggers/punctuation.py new file mode 100644 index 000000000..a6c78a7c1 --- /dev/null +++ 
b/nemo_text_processing/inverse_text_normalization/hy/taggers/punctuation.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + + +class PunctuationFst(GraphFst): + """ + Finite state transducer for classifying punctuation + e.g. , -> tokens { name: "," } + """ + + def __init__(self): + super().__init__(name="punctuation", kind="classify") + + s = "!#$%&\'()*+,-./:;<=>?@^_`{|}~,։՜՝" + punct = pynini.union(*s) + + graph = pynutil.insert("name: \"") + punct + pynutil.insert("\"") + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hy/taggers/time.py b/nemo_text_processing/inverse_text_normalization/hy/taggers/time.py new file mode 100644 index 000000000..1608cb6a0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/taggers/time.py @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.hy.utils import get_abs_path
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_space


class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time
    e.g. տասներկուսն անց հինգ -> time { hours: "12" minutes: "05" }
    e.g. հինգին տասնհինգ պակաս -> time { hours: "04" minutes: "45" }
    e.g. տասներեք անց կես -> time { hours: "12" minutes: "30" }
    """

    def __init__(self):
        super().__init__(name="time", kind="classify")

        # Hour/minute words may carry a case suffix ("ն" or "ին"); drop it.
        drop_case_suffix = pynini.closure(pynutil.delete("ն") | pynutil.delete("ին"), 0, 1)

        hours = pynini.string_file(get_abs_path("data/time/hours.tsv")) + drop_case_suffix
        minutes = pynini.string_file(get_abs_path("data/time/minutes.tsv")) + drop_case_suffix
        hours_to = pynini.string_file(get_abs_path("data/time/to_hour.tsv"))
        minutes_to = pynini.string_file(get_abs_path("data/time/minutes_to.tsv"))

        past_marker = pynutil.delete("անց")  # "past", e.g. "... անց հինգ"
        to_marker = pynutil.delete("պակաս")  # "to",   e.g. "... տասնհինգ պակաս"
        half = pynini.cross("կես", "30")     # "half past" -> :30

        hours_field = pynutil.insert("hours: \"") + hours + pynutil.insert("\"")
        minutes_field = (
            delete_space
            + pynutil.insert(" minutes: \"")
            + pynini.union(minutes, half)
            + pynutil.insert("\"")
        )

        # "<hour> անց [<minutes>]"
        time_standard = (
            hours_field + delete_space + past_marker + pynini.closure(minutes_field, 0, 1)
        )

        # "<hour>ին <minutes> պակաս" — both fields are remapped through the
        # to_hour / minutes_to tables (e.g. hour 5 -> 4, minutes 15 -> 45).
        hours_to_field = (hours + pynutil.delete('ին')) @ hours_to
        hours_to_field = pynutil.insert("hours: \"") + hours_to_field + pynutil.insert("\"")
        minutes_to_field = (
            pynutil.insert(" minutes: \"") + (minutes @ minutes_to) + pynutil.insert("\"")
        )
        time_to = (
            hours_to_field + delete_space + minutes_to_field + delete_space + to_marker
        )

        final_graph = self.add_tokens(time_standard | time_to)
        self.fst = final_graph.optimize()
# ---------------------------------------------------------------------------
# taggers/tokenize_and_classify.py
# ---------------------------------------------------------------------------

import os

import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.hy.taggers.cardinal import CardinalFst
from nemo_text_processing.inverse_text_normalization.hy.taggers.decimal import DecimalFst
from nemo_text_processing.inverse_text_normalization.hy.taggers.fraction import FractionFst
from nemo_text_processing.inverse_text_normalization.hy.taggers.measure import MeasureFst
from nemo_text_processing.inverse_text_normalization.hy.taggers.money import MoneyFst
from nemo_text_processing.inverse_text_normalization.hy.taggers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.hy.taggers.punctuation import PunctuationFst
from nemo_text_processing.inverse_text_normalization.hy.taggers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.hy.taggers.whitelist import WhiteListFst
from nemo_text_processing.inverse_text_normalization.hy.taggers.word import WordFst
from nemo_text_processing.text_normalization.en.graph_utils import (
    INPUT_LOWER_CASED,
    GraphFst,
    delete_extra_space,
    delete_space,
    generator_main,
)
from nemo_text_processing.utils.logging import logger


class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can
    process an entire sentence that is lower cased. For deployment, this grammar
    will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details on deployment at NeMo/tools/text_processing_deployment.

    Args:
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        whitelist: path to a custom whitelist file.
            NOTE(review): accepted for API parity with other languages but is
            currently not forwarded to WhiteListFst — confirm intended.
        overwrite_cache: set to True to overwrite .far files
        input_case: "lower_cased" or "cased" input.
    """

    def __init__(
        self,
        cache_dir: str = None,
        whitelist: str = None,
        overwrite_cache: bool = False,
        input_case: str = INPUT_LOWER_CASED,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify")

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, f"_hy_itn_{input_case}.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logger.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            # FIX: was an f-string with no placeholders (lint F541).
            logger.info("Creating ClassifyFst grammars.")

            cardinal = CardinalFst()
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal)
            ordinal_graph = ordinal.fst

            fraction = FractionFst(cardinal, ordinal)
            fraction_graph = fraction.fst

            decimal = DecimalFst(cardinal)
            decimal_graph = decimal.fst

            measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
            word_graph = WordFst().fst
            time_graph = TimeFst().fst
            money_graph = MoneyFst(cardinal, decimal).fst
            punct_graph = PunctuationFst().fst
            whitelist_graph = WhiteListFst().fst

            # Lower weight == higher priority; the plain-word fallback is last.
            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.05)
                | pynutil.add_weight(decimal_graph, 1.08)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(fraction_graph, 1.09)
                | pynutil.add_weight(money_graph, 1.07)
                | pynutil.add_weight(word_graph, 100)
            )

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
            )

            graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})


# ---------------------------------------------------------------------------
# taggers/whitelist.py
# ---------------------------------------------------------------------------

from nemo_text_processing.inverse_text_normalization.hy.utils import get_abs_path
from nemo_text_processing.text_normalization.en.graph_utils import convert_space


class WhiteListFst(GraphFst):
    """
    Finite state transducer for classifying whitelisted tokens
    e.g. մեր թվարկությունից առաջ -> tokens { name: "մ.թ.ա" }
    This class has highest priority among all classifier grammars.
    Whitelisted tokens are defined and loaded from "data/whitelist.tsv"
    (unless input_file specified).
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="classify")

        # The TSV maps written -> spoken; invert so spoken input maps back.
        whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
        graph = pynutil.insert("name: \"") + convert_space(whitelist) + pynutil.insert("\"")
        self.fst = graph.optimize()


# ---------------------------------------------------------------------------
# taggers/word.py
# ---------------------------------------------------------------------------

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE


class WordFst(GraphFst):
    """
    Finite state transducer for classifying plain tokens that do not belong to
    any special class. This can be considered as the default class.
    e.g. արթնանալ -> tokens { name: "արթնանալ" }
    """

    def __init__(self):
        super().__init__(name="word", kind="classify")
        word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"")
        self.fst = word.optimize()


# ---------------------------------------------------------------------------
# utils.py
# ---------------------------------------------------------------------------

import csv


def get_abs_path(rel_path):
    """
    Get absolute path.

    Args:
        rel_path: relative path to this file

    Returns absolute path
    """
    return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path


def load_labels(abs_path):
    """
    Loads a TSV file as a list of label rows.

    FIX: docstring previously claimed a dictionary was returned; the function
    returns a list of lists (one per TSV row).

    Args:
        abs_path: absolute path

    Returns list of label rows
    """
    with open(abs_path, encoding="utf-8") as label_tsv:
        labels = list(csv.reader(label_tsv, delimiter="\t"))
    return labels


def augment_labels_with_punct_at_end(labels):
    """
    Augments labels: if a key ends on a punctuation mark that the value does not
    have, add a new label where the value maintains the punctuation.

    Args:
        labels: input labels
    Returns:
        additional labels
    """
    res = []
    for label in labels:
        if len(label) > 1:
            if label[0][-1] == "." and label[1][-1] != ".":
                res.append([label[0], label[1] + "."] + label[2:])
    return res
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_DIGIT,
    NEMO_NOT_QUOTE,
    NEMO_SPACE,
    GraphFst,
    delete_space,
)


# ---------------------------------------------------------------------------
# verbalizers/cardinal.py
# ---------------------------------------------------------------------------

class CardinalFst(GraphFst):
    """
    Finite state transducer for verbalizing cardinal
    e.g. cardinal { integer: "97" } -> 97
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="verbalize")

        digits = pynini.closure(NEMO_DIGIT, 1)
        graph = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete("\"")
            + digits
            + pynutil.delete("\"")
        )

        # Exposed for reuse by the measure/money verbalizers.
        self.numbers = graph
        self.fst = self.delete_tokens(graph).optimize()


# ---------------------------------------------------------------------------
# verbalizers/decimal.py
# ---------------------------------------------------------------------------

class DecimalFst(GraphFst):
    """
    Finite state transducer for verbalizing decimal, e.g.
    decimal { integer_part: "12" fractional_part: "5" quantity: "միլիարդ" } -> 12.5 միլիարդ
    """

    def __init__(self):
        super().__init__(name="decimal", kind="verbalize")

        def _unquote(field_name):
            # field_name: "XYZ" -> XYZ
            return (
                pynutil.delete(field_name + ":")
                + delete_space
                + pynutil.delete("\"")
                + pynini.closure(NEMO_NOT_QUOTE, 1)
                + pynutil.delete("\"")
            )

        integer = _unquote("integer_part")
        fractional = pynutil.insert(".") + _unquote("fractional_part")
        quantity = _unquote("quantity")

        # All three fields are optional; quantity is preceded by a space.
        graph = (
            pynini.closure(integer + delete_space, 0, 1)
            + pynini.closure(fractional + delete_space, 0, 1)
            + pynini.closure(pynutil.insert(NEMO_SPACE) + quantity + delete_space, 0, 1)
        )

        # Exposed for reuse by the measure/money verbalizers.
        self.numbers = graph
        self.fst = self.delete_tokens(graph).optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space


class FractionFst(GraphFst):
    """
    Finite state transducer for verbalizing fraction
    e.g. tokens { fraction { numerator: "2" denominator: "3" } } -> 2/3
    """

    def __init__(self):
        super().__init__(name="fraction", kind="verbalize")

        quoted_value = pynini.closure(NEMO_NOT_QUOTE, 1)
        top = pynutil.delete("numerator: \"") + quoted_value + pynutil.delete("\"")
        bottom = (
            pynutil.insert('/')
            + pynutil.delete("denominator: \"")
            + quoted_value
            + pynutil.delete("\"")
        )

        graph = (top + delete_space + bottom).optimize()
        # Exposed for reuse by other verbalizers.
        self.numbers = graph
        self.fst = self.delete_tokens(graph).optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SPACE, GraphFst, delete_space


class MeasureFst(GraphFst):
    """
    Finite state transducer for verbalizing measure, e.g.
    measure { cardinal { integer: "59" } units: "Հց" } -> 59 Հց

    Args:
        decimal: DecimalFst
        cardinal: CardinalFst
    """

    def __init__(self, decimal: GraphFst, cardinal: GraphFst):
        super().__init__(name="measure", kind="verbalize")

        non_space = pynini.closure(NEMO_CHAR - NEMO_SPACE, 1)

        unit = (
            pynutil.delete("units:")
            + delete_space
            + pynutil.delete("\"")
            + non_space
            + pynutil.delete("\"")
            + delete_space
        )

        decimal_part = (
            pynutil.delete("decimal {") + delete_space + decimal.numbers + delete_space + pynutil.delete("}")
        )
        # NOTE(review): the deleted literals below keep a trailing space
        # ("} ") and a leading one (" integer: \"") — presumably matching the
        # tagger's exact serialization; confirm before changing.
        cardinal_part = (
            pynutil.delete("cardinal {") + delete_space + cardinal.numbers + delete_space + pynutil.delete("} ")
        )
        cardinal_raw = (
            pynutil.delete("cardinal {")
            + pynutil.delete(" integer: \"")
            + delete_space
            + non_space
            + pynutil.delete("\"")
            + delete_space
            + pynutil.delete("} ")
        )

        number_then_unit = (cardinal_part | decimal_part) + delete_space + pynutil.insert(" ") + unit
        raw_then_unit = cardinal_raw + delete_space + pynutil.insert(" ") + unit

        self.fst = self.delete_tokens(number_then_unit | raw_then_unit).optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_CHAR,
    NEMO_SPACE,
    GraphFst,
    delete_space,
    insert_space,
)


class MoneyFst(GraphFst):
    """
    Finite state transducer for verbalizing money, e.g.
    money { integer_part: "20000" currency: "֏" } -> 20000 ֏

    Args:
        decimal: DecimalFst
    """

    def __init__(self, decimal: GraphFst):
        super().__init__(name="money", kind="verbalize")

        currency = (
            pynutil.delete("currency:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_CHAR - NEMO_SPACE, 1)
            + pynutil.delete("\"")
        )

        # Amount is verbalized by the decimal grammar; currency symbol follows
        # after a single space.
        graph = decimal.numbers + delete_space + insert_space + currency
        self.fst = self.delete_tokens(graph).optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space


class OrdinalFst(GraphFst):
    """
    Finite state transducer for verbalizing ordinal, e.g.
    tokens { ordinal { integer: "3" } } -> 3-րդ
    """

    def __init__(self):
        super().__init__(name="ordinal", kind="verbalize")

        digits = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )

        # The bare number "1" takes the suffix "-ին"; every other number gets
        # "-րդ". The small weight on the default rule lets "-ին" win where
        # both rewrites could apply.
        one_rule = pynini.cross("[BOS]1", "[BOS]1-ին")
        default_rule = pynutil.insert("-րդ", weight=0.01)
        attach_suffix = pynini.cdrewrite(default_rule | one_rule, "", "[EOS]", NEMO_SIGMA)

        self.fst = self.delete_tokens(digits @ attach_suffix).optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, delete_space


class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing time, e.g.
    tokens { time { hours: "4" minutes: "45" } } -> 04:45
    tokens { time { hours: "1" minutes: "12" } } -> 01:12
    """

    def __init__(self):
        super().__init__(name="time", kind="verbalize")

        # Pad a single digit with a leading zero; pass two digits through.
        pad_two = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

        def _digits(field_name):
            # field_name: "NN" -> NN
            return (
                pynutil.delete(field_name + ":")
                + delete_space
                + pynutil.delete("\"")
                + pynini.closure(NEMO_DIGIT, 1)
                + pynutil.delete("\"")
            )

        graph = (
            (_digits("hours") @ pad_two)
            + delete_space
            + pynutil.insert(":")
            + (_digits("minutes") @ pad_two)
        )
        self.fst = self.delete_tokens(graph).optimize()
# ---------------------------------------------------------------------------
# verbalizers/verbalize.py
# ---------------------------------------------------------------------------

import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.hy.verbalizers.cardinal import CardinalFst
from nemo_text_processing.inverse_text_normalization.hy.verbalizers.decimal import DecimalFst
from nemo_text_processing.inverse_text_normalization.hy.verbalizers.fraction import FractionFst
from nemo_text_processing.inverse_text_normalization.hy.verbalizers.measure import MeasureFst
from nemo_text_processing.inverse_text_normalization.hy.verbalizers.money import MoneyFst
from nemo_text_processing.inverse_text_normalization.hy.verbalizers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.hy.verbalizers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.hy.verbalizers.whitelist import WhiteListFst
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_CHAR,
    NEMO_SIGMA,
    GraphFst,
    delete_extra_space,
    delete_space,
)


class VerbalizeFst(GraphFst):
    """Union of all Armenian semiotic-class verbalizers (tags -> written form)."""

    def __init__(self):
        super().__init__(name="verbalize", kind="verbalize")

        # Cardinal and decimal are shared with the measure/money verbalizers.
        cardinal = CardinalFst()
        decimal = DecimalFst()

        graph = (
            TimeFst().fst
            | MeasureFst(decimal=decimal, cardinal=cardinal).fst
            | FractionFst().fst
            | MoneyFst(decimal=decimal).fst
            | OrdinalFst().fst
            | decimal.fst
            | cardinal.fst
            | WhiteListFst().fst
        )
        self.fst = graph


# ---------------------------------------------------------------------------
# verbalizers/verbalize_final.py
# ---------------------------------------------------------------------------

class VerbalizeFinalFst(GraphFst):
    """
    Finite state transducer that verbalizes an entire sentence, e.g.
    tokens { name: "նա" } tokens { cardinal { integer: "23" } } tokens { name: "տարեկան" } tokens { name: "է" } -> նա 23 տարեկան է
    """

    def __init__(self):
        super().__init__(name="verbalize_final", kind="verbalize")

        token_body = VerbalizeFst().fst | WordFst().fst
        one_token = (
            pynutil.delete("tokens")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + token_body
            + delete_space
            + pynutil.delete("}")
        )
        graph = delete_space + pynini.closure(one_token + delete_extra_space) + one_token + delete_space
        self.fst = graph


# ---------------------------------------------------------------------------
# verbalizers/whitelist.py
# ---------------------------------------------------------------------------

class WhiteListFst(GraphFst):
    """
    Finite state transducer for verbalizing whitelist
    e.g. tokens { name: "մ.թ.ա" } -> մ.թ.ա
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="verbalize")

        body = (
            pynutil.delete("name:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_CHAR - " ", 1)
            + pynutil.delete("\"")
        )
        # Restore regular spaces from non-breaking spaces used during tagging.
        graph = body @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
        self.fst = graph.optimize()


# ---------------------------------------------------------------------------
# verbalizers/word.py
# ---------------------------------------------------------------------------

class WordFst(GraphFst):
    """
    Finite state transducer for verbalizing plain tokens
    e.g. tokens { name: "արթնանալ" } -> արթնանալ
    """

    def __init__(self):
        super().__init__(name="word", kind="verbalize")

        content = pynini.closure(NEMO_CHAR - " ", 1)
        body = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + content + pynutil.delete("\"")
        # Restore regular spaces from non-breaking spaces used during tagging.
        graph = body @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)

        self.fst = graph.optimize()
"--lang", help="language", choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi'], default="en", type=str + "--lang", help="language", choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi', 'hy'], default="en", type=str ) parser.add_argument( "--cat", diff --git a/tests/nemo_text_processing/hy/__init__.py b/tests/nemo_text_processing/hy/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/tests/nemo_text_processing/hy/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..42c015b0a --- /dev/null +++ b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,23 @@ +հարյուր~100 +տասներեք~13 +տասներկու~12 +քսանութ~28 +տասնհինգ հարյուր~15 100 +հիսունհինգ հազար~55000 +երկու միլիոն հարյուր իննսունինը~2000199 +մեկ միլիոն հարյուր հիսունվեց հազար հարյուր յոթանասուներեք~1156173 +մեկ միլիարդ հինգ հարյուր իննսուներեք միլիոն յոթանասուներկու հազար ինը հարյուր վաթսունմեկ~1593072961 +քսանհինգ հազար իննսունհինգ~25095 +քառասունվեց հազար վեց հարյուր~46600 +վաթսուն~60 +հազար~1000 +հարյուր~100 +հարյուր քսան~120 +հազար հարյուր հիսունյոթ~1157 +տասնութ միլիոն ինը հարյուր քառասուն հազար յոթ հարյուր քսաներկու~18940722 +տասնութ միլիոն վեց հարյուր իննսուն հազար ինը հարյուր տասնվեց~18690916 +տասնութ հազար ութ հարյուր ութսուն~18880 +հազար հարյուր~1100 +երկու հազար հարյուր տասնմեկ~2111 +վաթսուն~60 +քառասունվեց հազար վեց հարյուր վաթսունչորս~46664 diff --git a/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..ac7a23cfd --- /dev/null +++ b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_decimal.txt @@ -0,0 +1,14 @@ +զրո ամբողջ մեկ~0.1 +զրո ամբողջ յոթ~0.7 +մեկ ամբողջ յոթ միլիոնի պարտք պետությանը~1.7 միլիոնի պարտք պետությանը +մեկ ամբողջ երեք~1.3 +մեկ ամբողջ երկու~1.2 +այդ ամենը տևեց հինգ ու կես տարի~այդ ամենը տևեց 5.5 տարի +զրո ամբողջ ինը~0.9 +երկու ամբողջ երեք~2.3 +երկու ամբողջ հինգ~2.5 +մեկ ամբողջ երկու միլիարդ~1.2 միլիարդ +այստեղ ապրում է մոտ երեք ամբողջ յոթ միլիոն մարդ~այստեղ ապրում է մոտ 3.7 միլիոն մարդ +հինգ ու կես միլիոն~5.5 միլիոն +հարյուր իննսունյոթ ու կես միլիոն~197.5 միլիոն +ութ հարյուր տրիլիոն~800 տրիլիոն diff 
--git a/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_fraction.txt new file mode 100644 index 000000000..e31640d03 --- /dev/null +++ b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_fraction.txt @@ -0,0 +1,24 @@ +մեկ տասնմեկերորդ~1/11 +մեկ տասներկուերորդ~1/12 +մեկ տասներեքերորդ~1/13 +մեկ քառորդ~1/4 +մեկ չորրորդ~1/4 +օվկիանոսները կազմում են երկրագնդմի մակերեսի յոթանասունմեկ հարյուրերորդ մասը~օվկիանոսները կազմում են երկրագնդմի մակերեսի 71/100 մասը +մեկ հիսունհինգերորդ~1/55 +հինգ քսաներորդ~5/20 +պիցցայի մեկ երկրորդ մասը~պիցցայի 1/2 մասը +մեկ հիսուներորդ~1/50 +իննսունինը վաթսուներորդ~99/60 +մեկ յոթանասուներորդ~1/70 +որոշ բույսի չորս հինգերորդ մասը կազմված են ջրից~որոշ բույսի 4/5 մասը կազմված են ջրից +մեկ ութսուներորդ~1/80 +մեկ իննսուներորդ~1/90 +մեկ հարյուրերորդ~1/100 +երկու հինգերորդ~2/5 +տասն երեսուներորդ~10/30 +երեք քսաներորդ~3/20 +տասներկու հարյուրերորդ~12/100 +հիսունհինգ հարյուրերորդ~55/100 +հիսուն հինգ հարյուրերորդ~50/500 +մեկ տասներորդ~1/10 +մեկ քառասուներորդ~1/40 diff --git a/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..93ab72d08 --- /dev/null +++ b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_measure.txt @@ -0,0 +1,12 @@ +երկու հարյուր մետր~200 մ +հիսունվեց ամբողջ երեք քառակուսի կիլոմետր~56.3 կմ² +երկու հարյուր կիլոմետր ժամում~200 կմ/ժ +երկու հարյուր կիլոմետր ժամում~200 կմ/ժ +վաթսունվեց կիլոգրամ~66 կգ +հիսուն րոպե~50 ր +երկու հարյուր ձիաուժ~200 ձ.ու. 
+իննսուն գրամ~90 գ +երեք հարյուր սանտիմետր~300 սմ +վաթսունհինգ քառակուսի կիլոմետր~65 կմ² +հիսուն հերց~50 Հց +ինը հարյուր գեգաբայթ~900 գբ diff --git a/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_money.txt new file mode 100644 index 000000000..757543476 --- /dev/null +++ b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_money.txt @@ -0,0 +1,10 @@ +մեկ դոլար~1 $ +մեկ ռուբլի~1 ₽ +երեսունմեկ ռուբլի~31 ₽ +տասնվեց հազար ամն դոլար~16000 $ +տասնութ հազար դոլար~18000 $ +հիսունհինգ հազար դրամ~55000 ֏ +երկուս ու կես միլիոն տենգե~2.5 միլիոն ₸ +երեք ամբողջ երկու միլիարդ լարի~3.2 միլիարդ ₾ +իննսուն հազար քսանմեկ զլոտի~90021 zł +երեք բիթքոին~3 ₿ diff --git a/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_ordinal.txt new file mode 100644 index 000000000..34831ee2e --- /dev/null +++ b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,11 @@ +տասնմեկերորդ~11-րդ +տասներկուերորդ~12-րդ +տասներեքերորդ~13-րդ +քսանմեկերորդ~21-րդ +քսաներեքերորդ~23-րդ +հարյուր տասնմեկերորդ~111-րդ +հարյուր միլիոն քսանհինգերորդ~100000025-րդ +հազարերորդ~1000-րդ +հարյուր քսանմեկերորդ~121-րդ +երկրորդ~2-րդ +առաջին~1-ին diff --git a/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..9a9c4e14a --- /dev/null +++ b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_time.txt @@ -0,0 +1,9 @@ +ութ անց հիսունհինգ~08:55 +երեքին հինգ պակաս~02:55 +հինգ անց կես~05:30 +քսաներեքն անց հիսունհինգ~23:55 +մեկ անց կես~01:30 +տասնմեկին տասնհինգ պակաս~10:45 +վեցին տասը պակաս~05:50 +երկուս անց քսաներկու~02:22 +յոթ անց կես~07:30 diff --git 
a/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..4cd226d88 --- /dev/null +++ b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,7 @@ +սույն թվականի~ս.թ. +մեր թվարկությունից առաջ~մ.թ.ա. +մեր թվարկություն~մ.թ. +քրիստոսից առաջ~Ք.ա. +քրիստոսից հետո~Ք.հ. +քրիստոսի ծնունդից առաջ~Ք.ծ.ա. +քրիստոսից ծնունդից հետո~Ք.ծ.հ. \ No newline at end of file diff --git a/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..304118a4d --- /dev/null +++ b/tests/nemo_text_processing/hy/data_inverse_text_normalization/test_cases_word.txt @@ -0,0 +1,50 @@ +~ +yahoo!~yahoo! +քսան !~20 ! +սսսս հիսունհինգ~սսսս 55 +x~x +—~— +aaa~aaa +aabach~aabach +aabenraa~aabenraa +aabye~aabye +aaccessed~aaccessed +aach~aach +aachen's~aachen's +aadri~aadri +aafia~aafia +aagaard~aagaard +aagadu~aagadu +aagard~aagard +aagathadi~aagathadi +aaghart's~aaghart's +aagnes~aagnes +aagomoni~aagomoni +aagon~aagon +aagoo~aagoo +aagot~aagot +aahar~aahar +aahh~aahh +aahperd~aahperd +aaibinterstate~aaibinterstate +aajab~aajab +aakasa~aakasa +aakervik~aakervik +aakirkeby~aakirkeby +aalam~aalam +aalbaek~aalbaek +aaldiu~aaldiu +aalem~aalem +a'ali~a'ali +aalilaassamthey~aalilaassamthey +aalin~aalin +aaliyan~aaliyan +aaliyan's~aaliyan's +aamadu~aamadu +aamara~aamara +aambala~aambala +aamera~aamera +aamer's~aamer's +aamina~aamina +aaminah~aaminah +aamjiwnaang~aamjiwnaang diff --git a/tests/nemo_text_processing/hy/test_cardinal.py b/tests/nemo_text_processing/hy/test_cardinal.py new file mode 100644 index 000000000..82a9d1f1d --- /dev/null +++ b/tests/nemo_text_processing/hy/test_cardinal.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestCardinal: + + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=True) + + @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=True) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_decimal.py b/tests/nemo_text_processing/hy/test_decimal.py new file mode 100644 index 000000000..051b60e21 --- /dev/null +++ b/tests/nemo_text_processing/hy/test_decimal.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDecimal: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_fraction.py b/tests/nemo_text_processing/hy/test_fraction.py new file mode 100644 index 000000000..28157274e --- /dev/null +++ b/tests/nemo_text_processing/hy/test_fraction.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestFraction: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=True) + + @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_fraction.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=True) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_measure.py b/tests/nemo_text_processing/hy/test_measure.py new file mode 100644 index 000000000..ccf73cb63 --- /dev/null +++ b/tests/nemo_text_processing/hy/test_measure.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestMeasure: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_measure.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_money.py b/tests/nemo_text_processing/hy/test_money.py new file mode 100644 index 000000000..8d4189091 --- /dev/null +++ b/tests/nemo_text_processing/hy/test_money.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestMoney: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_money.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_ordinal.py b/tests/nemo_text_processing/hy/test_ordinal.py new file mode 100644 index 000000000..52026d486 --- /dev/null +++ b/tests/nemo_text_processing/hy/test_ordinal.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestOrdinal: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=True) + + @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_ordinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=True) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/hy/test_sparrowhawk_inverse_text_normalization.sh new file mode 100755 index 000000000..a7c2fb80e --- /dev/null +++ b/tests/nemo_text_processing/hy/test_sparrowhawk_inverse_text_normalization.sh @@ -0,0 +1,69 @@ +#! /bin/sh + +PROJECT_DIR=/workspace/tests + +runtest () { + input=$1 + cd /workspace/sparrowhawk/documentation/grammars + + # read test file + while read testcase; do + IFS='~' read spoken written <<< $testcase + denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + + # trim white space + written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$spoken" "$written" "$denorm_pred" + done < "$input" +} + +testITNCardinal() { + input=$PROJECT_DIR/hy/data_inverse_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testITNDecimal() { + input=$PROJECT_DIR/hy/data_inverse_text_normalization/test_cases_decimal.txt + runtest $input +} + +testITNOrdinal() { + input=$PROJECT_DIR/hy/data_inverse_text_normalization/test_cases_ordinal.txt + runtest $input +} + +testITNFraction() { + 
input=$PROJECT_DIR/hy/data_inverse_text_normalization/test_cases_fraction.txt + runtest $input +} + +testITNTime() { + input=$PROJECT_DIR/hy/data_inverse_text_normalization/test_cases_time.txt + runtest $input +} + +testITNMeasure() { + input=$PROJECT_DIR/hy/data_inverse_text_normalization/test_cases_measure.txt + runtest $input +} + +testITNMoney() { + input=$PROJECT_DIR/hy/data_inverse_text_normalization/test_cases_money.txt + runtest $input +} + +testITNWhitelist() { + input=$PROJECT_DIR/hy/data_inverse_text_normalization/test_cases_whitelist.txt + runtest $input +} + +testITNWord() { + input=$PROJECT_DIR/hy/data_inverse_text_normalization/test_cases_word.txt + runtest $input +} + +# Load shUnit2 +. $PROJECT_DIR/../shunit2/shunit2 diff --git a/tests/nemo_text_processing/hy/test_time.py b/tests/nemo_text_processing/hy/test_time.py new file mode 100644 index 000000000..7b9df98b5 --- /dev/null +++ b/tests/nemo_text_processing/hy/test_time.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTime: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_whitelist.py b/tests/nemo_text_processing/hy/test_whitelist.py new file mode 100644 index 000000000..59b31483f --- /dev/null +++ b/tests/nemo_text_processing/hy/test_whitelist.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWhitelist: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_word.py b/tests/nemo_text_processing/hy/test_word.py new file mode 100644 index 000000000..69a34456d --- /dev/null +++ b/tests/nemo_text_processing/hy/test_word.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWord: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 58eb6a706..5db4531a5 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -80,7 +80,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=["en", "de", "es", "pt", "ru", 'fr', 'hu', 'sv', 'vi', 'zh', 'ar', 'it', 'es_en'], + choices=["en", "de", "es", "pt", "ru", 'fr', 'hu', 'sv', 'vi', 'zh', 'ar', 'it', 'es_en', 'hy'], type=str, default='en', ) @@ -228,6 +228,13 @@ def parse_args(): from nemo_text_processing.inverse_text_normalization.es_en.verbalizers.verbalize import ( VerbalizeFst as ITNVerbalizeFst, ) + elif args.language == 'hy': + from nemo_text_processing.inverse_text_normalization.hy.taggers.tokenize_and_classify import ( + ClassifyFst as ITNClassifyFst, + ) + from nemo_text_processing.inverse_text_normalization.hy.verbalizers.verbalize import ( + VerbalizeFst as ITNVerbalizeFst, + ) output_dir = os.path.join(args.output_dir, args.language) export_grammars( output_dir=output_dir, From 7bc3654801119310d73dd8a1e1ca60ff0e7ecf37 Mon Sep 17 00:00:00 2001 From: Evelina <10428420+ekmb@users.noreply.github.com> Date: Thu, 29 Feb 2024 08:43:12 -0800 Subject: [PATCH 09/90] Fix CI (#142) * fix whitelist deployment 
Signed-off-by: Evelina * clean up Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * comment out tests to recreate grammars Signed-off-by: Evelina * shorten test Signed-off-by: Evelina * fix jenkins Signed-off-by: Evelina * cased for TN Signed-off-by: Evelina * revert debug changes Signed-off-by: Evelina * fix args default Signed-off-by: Evelina * try parallel Signed-off-by: Evelina * debug parallel Signed-off-by: Evelina * rerun Signed-off-by: Evelina * rerun Signed-off-by: Evelina * fix sh tests for local SH launcher Signed-off-by: Evelina * enable all ci tests Signed-off-by: Evelina * enable all ci tests Signed-off-by: Evelina --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui --- Jenkinsfile | 115 ++++-------------- .../text_normalization/normalize.py | 6 + ..._sparrowhawk_inverse_text_normalization.sh | 6 +- .../de/test_sparrowhawk_normalization.sh | 6 +- .../test_cases_money.txt | 2 +- tests/nemo_text_processing/en/test_money.py | 4 +- ..._sparrowhawk_inverse_text_normalization.sh | 5 +- ...owhawk_inverse_text_normalization_cased.sh | 8 +- .../en/test_sparrowhawk_normalization.sh | 6 +- ..._sparrowhawk_inverse_text_normalization.sh | 6 +- .../es/test_sparrowhawk_normalization.sh | 6 +- ..._sparrowhawk_inverse_text_normalization.sh | 6 +- ..._sparrowhawk_inverse_text_normalization.sh | 10 +- .../fr/test_sparrowhawk_normalization.sh | 6 +- .../hu/test_sparrowhawk_normalization.sh | 6 +- ..._sparrowhawk_inverse_text_normalization.sh | 6 +- .../it/test_sparrowhawk_normalization.sh | 6 +- ..._sparrowhawk_inverse_text_normalization.sh | 6 +- ..._sparrowhawk_inverse_text_normalization.sh | 6 +- ..._sparrowhawk_inverse_text_normalization.sh | 6 +- .../sv/test_sparrowhawk_normalization.sh | 6 +- ..._sparrowhawk_inverse_text_normalization.sh | 6 +- ..._sparrowhawk_inverse_text_normalization.sh | 6 
+- .../zh/test_sparrowhawk_normalization.sh | 6 +- .../docker/launch.sh | 12 +- .../export_grammars.sh | 34 +++--- .../pynini_export.py | 2 +- tools/text_processing_deployment/sh_test.sh | 19 ++- 28 files changed, 158 insertions(+), 161 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index d1b4062e4..0bc046399 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -90,7 +90,7 @@ pipeline { } } - stage('L0: Create DE TN/ITN Grammars') { + stage('L0: Create DE/ES TN/ITN Grammars') { when { anyOf { branch 'main' @@ -109,19 +109,6 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=de --text="ein hundert " --cache_dir ${DEFAULT_TN_CACHE}' } } - - } - } - - stage('L0: Create ES TN/ITN Grammars') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { stage('L0: ES TN grammars') { steps { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=es --text="1" --cache_dir ${ES_TN_CACHE}' @@ -132,26 +119,11 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=es --text="ciento uno " --cache_dir ${ES_TN_CACHE}' } } - - } - } - - stage('L0: Create Codeswitched ES/EN TN/ITN Grammars') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - - stage('L0: ES/EN ITN grammars') { + stage('L0: Codeswitched ES/EN ITN grammars') { steps { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=es_en --text="ciento uno " --cache_dir ${ES_EN_TN_CACHE}' } } - } } @@ -178,7 +150,7 @@ pipeline { } } - stage('L0: Create FR TN/ITN Grammars') { + stage('L0: Create FR TN/ITN & VI ITN & HU TN & IT TN') { when { anyOf { branch 'main' @@ -197,18 +169,6 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py 
--lang=fr --text="cent " --cache_dir ${FR_TN_CACHE}' } } - - } - } - stage('L0: Create VI ITN & HU TN & IT TN') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { stage('L0: VI ITN grammars') { steps { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=vi --text="một ngàn " --cache_dir ${VI_TN_CACHE}' @@ -227,29 +187,7 @@ pipeline { } } - stage('L0: Create PT TN/ITN Grammars') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - // stage('L0: PT TN grammars') { - // steps { - // sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=pt --text="2" --cache_dir ${DEFAULT_TN_CACHE}' - // } - // } - stage('L0: PT ITN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=pt --text="dez " --cache_dir ${PT_TN_CACHE}' - } - } - - } - } - stage('L0: Create RU TN/ITN Grammars') { + stage('L0: Create RU TN/ITN Grammars & SV & PT & ZH') { when { anyOf { branch 'main' @@ -268,17 +206,6 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ru --text="три " --cache_dir ${RU_TN_CACHE}' } } - } - } - stage('L0: Create SV TN/ITN Grammars') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { stage('L0: SV TN grammars') { steps { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=sv --text="100" --cache_dir ${SV_TN_CACHE}' @@ -289,17 +216,16 @@ pipeline { // sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=sv --text="hundra " --cache_dir ${SV_TN_CACHE}' // } // } - } - } - stage('L0: Create ZH TN/ITN Grammars') { - when { - anyOf { - branch 'main' - changeRequest target: 
'main' + // stage('L0: PT TN grammars') { + // steps { + // sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=pt --text="2" --cache_dir ${DEFAULT_TN_CACHE}' + // } + // } + stage('L0: PT ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=pt --text="dez " --cache_dir ${PT_TN_CACHE}' + } } - } - failFast true - parallel { stage('L0: ZH TN grammars') { steps { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=zh --text="你" --cache_dir ${ZH_TN_CACHE}' @@ -313,6 +239,7 @@ pipeline { } } + // L1 Tests starts here stage('L1: TN/ITN Tests CPU') { when { @@ -389,25 +316,25 @@ pipeline { } } failFast true - stages { + parallel { stage('L2: EN ITN Run Sparrowhawk test - Lower Cased Input') { steps { - sh 'CUDA_VISIBLE_DEVICES="" cd tools/text_processing_deployment && bash sh_test.sh --MODE="test_itn_grammars" --OVERWRITE_CACHE=False --FAR_PATH=${EN_TN_CACHE}/SH_ITN --LANGUAGE="en"' - sh 'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_inverse_text_normalization.sh `pwd`' + sh 'CUDA_VISIBLE_DEVICES="" cp -r /workspace/sparrowhawk/documentation/grammars /workspace/sparrowhawk/documentation/grammars_en_itn_grammars_lower_cased && cd tools/text_processing_deployment && bash sh_test.sh --MODE="test_itn_grammars" --OVERWRITE_CACHE=False --FAR_PATH=${EN_TN_CACHE}/SH_ITN --LANGUAGE="en"' + sh 'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_inverse_text_normalization.sh /workspace/sparrowhawk/documentation/grammars_en_itn_grammars_lower_cased `pwd`' } } stage('L2: EN ITN Run Sparrowhawk test - Cased Input') { steps { - sh 'CUDA_VISIBLE_DEVICES="" cd tools/text_processing_deployment && bash sh_test.sh --MODE="test_itn_grammars" --INPUT_CASE="cased" --OVERWRITE_CACHE=False --FAR_PATH=${EN_TN_CACHE}/SH_ITN_cased --LANGUAGE="en"' - sh 
'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_inverse_text_normalization_cased.sh `pwd`' + sh 'CUDA_VISIBLE_DEVICES="" cp -r /workspace/sparrowhawk/documentation/grammars /workspace/sparrowhawk/documentation/grammars_en_itn_grammars_cased && cd tools/text_processing_deployment && bash sh_test.sh --MODE="test_itn_grammars" --INPUT_CASE="cased" --OVERWRITE_CACHE=False --FAR_PATH=${EN_TN_CACHE}/SH_ITN_cased --LANGUAGE="en"' + sh 'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_inverse_text_normalization_cased.sh /workspace/sparrowhawk/documentation/grammars_en_itn_grammars_cased `pwd`' } } stage('L2: EN TN Run Sparrowhawk test') { steps { - sh 'CUDA_VISIBLE_DEVICES="" cd tools/text_processing_deployment && bash sh_test.sh --MODE="test_tn_grammars" --OVERWRITE_CACHE=False --FAR_PATH=${EN_TN_CACHE}/SH_TN --GRAMMARS="tn_grammars" --LANGUAGE="en" ' - sh 'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_normalization.sh `pwd`' + sh 'CUDA_VISIBLE_DEVICES="" cp -r /workspace/sparrowhawk/documentation/grammars /workspace/sparrowhawk/documentation/grammars_en_tn_grammars_cased && cd tools/text_processing_deployment && bash sh_test.sh --MODE="test_tn_grammars" --INPUT_CASE="cased" --OVERWRITE_CACHE=False --FAR_PATH=${EN_TN_CACHE}/SH_TN --GRAMMARS="tn_grammars" --LANGUAGE="en" ' + sh 'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_normalization.sh /workspace/sparrowhawk/documentation/grammars_en_tn_grammars_cased `pwd`' } } diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index ad284e871..5093eceef 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -721,6 +721,11 @@ def parse_args(): type=str, ) parser.add_argument("--verbose", help="print info for debugging", action='store_true') + 
parser.add_argument( + "--no_post_process", + help="WFST-based post processing, e.g. to remove extra spaces added during TN, normalize punctuation marks [could differ from the input]. Only Eng is supported, not supported in Sparrowhawk", + action="store_true", + ) parser.add_argument( "--punct_post_process", help="Add this flag to enable punctuation post processing to match input.", @@ -765,6 +770,7 @@ def parse_args(): normalizer = Normalizer( input_case=args.input_case, + post_process=not args.no_post_process, cache_dir=args.cache_dir, overwrite_cache=args.overwrite_cache, whitelist=whitelist, diff --git a/tests/nemo_text_processing/de/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/de/test_sparrowhawk_inverse_text_normalization.sh index d7aba4c02..634603c49 100644 --- a/tests/nemo_text_processing/de/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/de/test_sparrowhawk_inverse_text_normalization.sh @@ -2,9 +2,13 @@ PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} + runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/de/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/de/test_sparrowhawk_normalization.sh index 3dfdfe9f3..4f38d8c47 100644 --- a/tests/nemo_text_processing/de/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/de/test_sparrowhawk_normalization.sh @@ -1,10 +1,12 @@ #! 
/bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt index 6fcef0ea5..e2c828f42 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt @@ -63,4 +63,4 @@ $1,925.21~one thousand nine hundred and twenty five dollars twenty one cents $1,234.123~one thousand two hundred and thirty four point one two three dollars US $76.3 trillion~US seventy six point three trillion dollars US$76.3 trillion~seventy six point three trillion us dollars -The price for each canned salmon is $5, each bottle of peanut butter is $3~The price for each canned salmon is five dollars, each bottle of peanut butter is three dollars +The price for each canned salmon is $5 , each bottle of peanut butter is $3~The price for each canned salmon is five dollars , each bottle of peanut butter is three dollars diff --git a/tests/nemo_text_processing/en/test_money.py b/tests/nemo_text_processing/en/test_money.py index 9f1387c51..c81945ecd 100644 --- a/tests/nemo_text_processing/en/test_money.py +++ b/tests/nemo_text_processing/en/test_money.py @@ -45,7 +45,9 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_en_cased.inverse_normalize(test_input, verbose=False) assert pred == expected, f"input: {test_input}" - normalizer_en = Normalizer(input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False) + normalizer_en = Normalizer( + input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False + ) 
normalizer_with_audio_en = ( NormalizerWithAudio(input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False) if RUN_AUDIO_BASED_TESTS diff --git a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh index 47f726ef4..705f4bdaf 100644 --- a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh @@ -1,11 +1,12 @@ #! /bin/sh -TEST_DIR=${1:-"/workspace/tests/en"} +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +TEST_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 echo "INPUT is $input" - cd /workspace/sparrowhawk/documentation/grammars + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh index 58407fcde..8c701e06a 100644 --- a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh +++ b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh @@ -1,12 +1,12 @@ #! /bin/sh -TEST_DIR=${1:-"/workspace/tests/en"} +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +TEST_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - echo "INPUT is $input" - - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh index 1969d64e9..7374b9ef9 100644 --- a/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh @@ -1,9 +1,11 @@ #! 
/bin/sh -TEST_DIR=${1:-"/workspace/tests/en"} +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +TEST_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 echo "INPUT is $input" - cd /workspace/sparrowhawk/documentation/grammars + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/es/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/es/test_sparrowhawk_inverse_text_normalization.sh index f6a71c2cb..99eca7a88 100644 --- a/tests/nemo_text_processing/es/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/es/test_sparrowhawk_inverse_text_normalization.sh @@ -1,10 +1,12 @@ #! /bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/es/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/es/test_sparrowhawk_normalization.sh index f3cb17fff..0c9e537fe 100644 --- a/tests/nemo_text_processing/es/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/es/test_sparrowhawk_normalization.sh @@ -1,10 +1,12 @@ #! 
/bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/es_en/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/es_en/test_sparrowhawk_inverse_text_normalization.sh index 98554f619..6765d360c 100644 --- a/tests/nemo_text_processing/es_en/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/es_en/test_sparrowhawk_inverse_text_normalization.sh @@ -13,11 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/fr/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/fr/test_sparrowhawk_inverse_text_normalization.sh index 4ca12af7f..0a8336f68 100644 --- a/tests/nemo_text_processing/fr/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/fr/test_sparrowhawk_inverse_text_normalization.sh @@ -1,10 +1,16 @@ #! 
/bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + +runtest () { + input=$1 + cd /workspace/sparrowhawk/documentation/grammars_itn_grammars_lower_cased # read test file while read testcase; do diff --git a/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh index 0713d6f34..009032118 100644 --- a/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh @@ -1,10 +1,12 @@ #! /bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/hu/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/hu/test_sparrowhawk_normalization.sh index f30e53807..c487d3824 100644 --- a/tests/nemo_text_processing/hu/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/hu/test_sparrowhawk_normalization.sh @@ -1,10 +1,12 @@ #! 
/bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/hy/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/hy/test_sparrowhawk_inverse_text_normalization.sh index a7c2fb80e..ebde5c9d8 100755 --- a/tests/nemo_text_processing/hy/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/hy/test_sparrowhawk_inverse_text_normalization.sh @@ -1,10 +1,12 @@ #! /bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh index c8285be97..e43d90353 100644 --- a/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh @@ -1,10 +1,12 @@ #! 
/bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/pt/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/pt/test_sparrowhawk_inverse_text_normalization.sh index 74d8ddafd..511411e90 100755 --- a/tests/nemo_text_processing/pt/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/pt/test_sparrowhawk_inverse_text_normalization.sh @@ -1,10 +1,12 @@ #! /bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/ru/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/ru/test_sparrowhawk_inverse_text_normalization.sh index 1c975e070..6df536467 100644 --- a/tests/nemo_text_processing/ru/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/ru/test_sparrowhawk_inverse_text_normalization.sh @@ -1,10 +1,12 @@ #! 
/bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/sv/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/sv/test_sparrowhawk_inverse_text_normalization.sh index 2ed171ccf..350136c4a 100644 --- a/tests/nemo_text_processing/sv/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/sv/test_sparrowhawk_inverse_text_normalization.sh @@ -1,10 +1,12 @@ #! /bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/sv/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/sv/test_sparrowhawk_normalization.sh index 71a6466fb..49480ee6a 100644 --- a/tests/nemo_text_processing/sv/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/sv/test_sparrowhawk_normalization.sh @@ -1,10 +1,12 @@ #! 
/bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/vi/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/vi/test_sparrowhawk_inverse_text_normalization.sh index cc1defd46..751351cd4 100644 --- a/tests/nemo_text_processing/vi/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/vi/test_sparrowhawk_inverse_text_normalization.sh @@ -1,10 +1,12 @@ #! /bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh index ade1027a7..708f54dd5 100644 --- a/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh @@ -1,10 +1,12 @@ #! 
/bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh index 6c3a6598f..4cbbf0d0d 100644 --- a/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh @@ -1,10 +1,12 @@ #! /bin/sh -PROJECT_DIR=/workspace/tests +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/en"} runtest () { input=$1 - cd /workspace/sparrowhawk/documentation/grammars + echo "INPUT is $input" + cd ${GRAMMARS_DIR} # read test file while read testcase; do diff --git a/tools/text_processing_deployment/docker/launch.sh b/tools/text_processing_deployment/docker/launch.sh index 1ba641ac0..98fdff534 100644 --- a/tools/text_processing_deployment/docker/launch.sh +++ b/tools/text_processing_deployment/docker/launch.sh @@ -14,15 +14,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+# this script runs Sparrowhawk tests in a docker container "locally" (not in CI/CD pipeline) + MODE=${1:-"interactive"} LANGUAGE=${2:-"en"} INPUT_CASE=${3:-"lower_cased"} +GRAMMARS=${4:-"tn_grammars"} # tn_grammars or itn_grammars SCRIPT_DIR=$(cd $(dirname $0); pwd) -GRAMMAR_DIR=${4:-${SCRIPT_DIR}"/.."} +GRAMMAR_DIR=${5:-${SCRIPT_DIR}"/.."} +CONFIG=${LANGUAGE}_${GRAMMARS}_${INPUT_CASE} -: ${CLASSIFY_DIR:="$GRAMMAR_DIR/$LANGUAGE/classify"} -: ${VERBALIZE_DIR:="$GRAMMAR_DIR/$LANGUAGE/verbalize"} -: ${CMD:=${5:-"/bin/bash"}} +: ${CLASSIFY_DIR:="$GRAMMAR_DIR/${CONFIG}/classify"} +: ${VERBALIZE_DIR:="$GRAMMAR_DIR/${CONFIG}/verbalize"} +: ${CMD:=${6:-"/bin/bash"}} MOUNTS="" MOUNTS+=" -v $CLASSIFY_DIR:/workspace/sparrowhawk/documentation/grammars/en_toy/classify" diff --git a/tools/text_processing_deployment/export_grammars.sh b/tools/text_processing_deployment/export_grammars.sh index d44f876e6..95f4edafb 100644 --- a/tools/text_processing_deployment/export_grammars.sh +++ b/tools/text_processing_deployment/export_grammars.sh @@ -36,9 +36,8 @@ LANGUAGE="en" # language, {'en', 'es', 'de','zh'} supports both TN and ITN, {'pt MODE="export" # default is one of {'export', 'interactive', 'test', 'ci'}. 
Default "export" OVERWRITE_CACHE="True" # Set to False to re-use .far files FORCE_REBUILD="False" # Set to True to re-build docker file -WHITELIST=None # Path to a whitelist file, if None the default will be used +WHITELIST="" # Path to a whitelist file, if None the default will be used FAR_PATH=$(pwd) # Path where the grammars should be written -SKIP_FAR_CREATION="False" for ARG in "$@" do @@ -52,7 +51,8 @@ do done -CACHE_DIR=${FAR_PATH}/${LANGUAGE} +CACHE_DIR=${FAR_PATH}/${LANGUAGE}_${GRAMMARS}_${INPUT_CASE} + echo "GRAMMARS = $GRAMMARS" echo "MODE = $MODE" echo "LANGUAGE = $LANGUAGE" @@ -62,10 +62,17 @@ echo "OVERWRITE_CACHE = $OVERWRITE_CACHE" echo "FORCE_REBUILD = $FORCE_REBUILD" echo "WHITELIST = $WHITELIST" +# check if WHITELIST file exists +if [[ ${WHITELIST} != "" ]] && [[ -f $WHITELIST ]]; then + WHITELIST="--whitelist=${WHITELIST} " + echo "[I] Using provided whitelist file" +else + WHITELIST="" +fi + if [[ ${OVERWRITE_CACHE,,} == "true" ]] ; then OVERWRITE_CACHE="--overwrite_cache " - SKIP_FAR_CREATION="True" else OVERWRITE_CACHE="" fi @@ -73,14 +80,16 @@ fi CLASSIFY_FAR=${CACHE_DIR}"/classify/tokenize_and_classify.far" VERBALIZE_FAR=${CACHE_DIR}"/verbalize/verbalize.far" -if [[ -f $CLASSIFY_FAR ]] && [[ -f $VERBALIZE_FAR ]] && [[ ${OVERWRITE_CACHE} == "" ]]; then - SKIP_FAR_CREATION="True" - echo "Far files exists and OVERWRITE_CACHE is set to False" +# check if .far files do not exist +if [[ ! -f $CLASSIFY_FAR ]] || [[ ! 
-f $VERBALIZE_FAR ]] ; then + echo "[I] FSTs do not exist, will overwrite cache" + OVERWRITE_CACHE="--overwrite_cache " fi -if [[ ${SKIP_FAR_CREATION} != "True" ]]; then +if [[ ${OVERWRITE_CACHE} != "" ]] ; then + echo "[I] Exporting grammars" python3 pynini_export.py --output_dir=${FAR_PATH} --grammars=${GRAMMARS} --input_case=${INPUT_CASE} \ - --language=${LANGUAGE} --cache_dir=${CACHE_DIR} --whitelist=${WHITELIST} ${OVERWRITE_CACHE} || exit 1 + --language=${LANGUAGE} --cache_dir=${CACHE_DIR} ${WHITELIST} ${OVERWRITE_CACHE} || exit 1 fi if [[ ${FORCE_REBUILD,,} == "true" ]]; then @@ -90,15 +99,12 @@ fi find . -name "Makefile" -type f -delete - - - - if [[ ${MODE} == "test" ]] || [[ ${MODE} == "interactive" ]]; then MODE=${MODE}_${GRAMMARS} bash docker/build.sh $FORCE_REBUILD - bash docker/launch.sh $MODE $LANGUAGE $INPUT_CASE $FAR_PATH + bash docker/launch.sh $MODE $LANGUAGE $INPUT_CASE $GRAMMARS $FAR_PATH else + echo "done mode: $MODE" exit 0 fi diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 5db4531a5..aa3207e3e 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -235,7 +235,7 @@ def parse_args(): from nemo_text_processing.inverse_text_normalization.hy.verbalizers.verbalize import ( VerbalizeFst as ITNVerbalizeFst, ) - output_dir = os.path.join(args.output_dir, args.language) + output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( output_dir=output_dir, grammars=locals()[args.grammars]( diff --git a/tools/text_processing_deployment/sh_test.sh b/tools/text_processing_deployment/sh_test.sh index b66686991..2f5dd9e81 100644 --- a/tools/text_processing_deployment/sh_test.sh +++ b/tools/text_processing_deployment/sh_test.sh @@ -21,7 +21,7 @@ GRAMMARS="itn_grammars" # tn_grammars INPUT_CASE="lower_cased" # cased LANGUAGE="en" # language, {'en', 'es', 'de','zh'} 
supports both TN and ITN, {'pt', 'ru', 'fr', 'vi'} supports ITN only OVERWRITE_CACHE="False" # Set to False to re-use .far files -WHITELIST=None # Path to a whitelist file, if None the default will be used +WHITELIST="" # Path to a whitelist file, if None the default will be used FAR_PATH=$(pwd) # Path where the grammars should be written MODE="test_itn_grammars" @@ -46,13 +46,22 @@ echo "OVERWRITE_CACHE = $OVERWRITE_CACHE" echo "FORCE_REBUILD = $FORCE_REBUILD" echo "WHITELIST = $WHITELIST" +if [[ ${WHITELIST} != "" ]] && [[ -f $WHITELIST ]]; then + WHITELIST="--whitelist=${WHITELIST} " + echo "[I] Using provided whitelist file" +else + WHITELIST="" +fi + bash export_grammars.sh --MODE="export" --GRAMMARS=$GRAMMARS --LANGUAGE=$LANGUAGE --INPUT_CASE=$INPUT_CASE \ - --FAR_PATH=$FAR_PATH --CACHE_DIR=$CACHE_DIR --OVERWRITE_CACHE=$OVERWRITE_CACHE --FORCE_REBUILD=$FORCE_REBUILD \ - --WHITELIST=$WHITELIST + --FAR_PATH=$FAR_PATH --CACHE_DIR=$CACHE_DIR --OVERWRITE_CACHE=$OVERWRITE_CACHE \ + --FORCE_REBUILD=$FORCE_REBUILD $WHITELIST CLASSIFY_FAR=${CACHE_DIR}"/classify/tokenize_and_classify.far" VERBALIZE_FAR=${CACHE_DIR}"/verbalize/verbalize.far" -cp $CLASSIFY_FAR /workspace/sparrowhawk/documentation/grammars/en_toy/classify/ -cp $VERBALIZE_FAR /workspace/sparrowhawk/documentation/grammars/en_toy/verbalize/ +CONFIG=${LANGUAGE}_${GRAMMARS}_${INPUT_CASE} + +cp $CLASSIFY_FAR /workspace/sparrowhawk/documentation/grammars_${CONFIG}/en_toy/classify/ +cp $VERBALIZE_FAR /workspace/sparrowhawk/documentation/grammars_${CONFIG}/en_toy/verbalize/ From bf43b19081411484faaca15ab514f7042f4516d2 Mon Sep 17 00:00:00 2001 From: David Sargsyan <66821320+davidks13@users.noreply.github.com> Date: Wed, 13 Mar 2024 04:23:47 +0400 Subject: [PATCH 10/90] Armenian TN (#137) * merged with main branch and fixed conflicts Signed-off-by: David Sargsyan * fixing conflicts Signed-off-by: David Sargsyan * fixing some more conflicts Signed-off-by: David Sargsyan * 
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: David Sargsyan * fixed a minor issue Signed-off-by: David Sargsyan * deleted unused imports Signed-off-by: David Sargsyan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix: add "hy" language option for armenian Signed-off-by: Ara Yeroyan <60027241+Ara-Yeroyan@users.noreply.github.com> * added optional space for measurements after cardinals/decimals Signed-off-by: David Sargsyan * added Armenian dot Signed-off-by: David Sargsyan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: David Sargsyan Signed-off-by: Ara Yeroyan <60027241+Ara-Yeroyan@users.noreply.github.com> Signed-off-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> Co-authored-by: David Sargsyan Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Ara Yeroyan <60027241+Ara-Yeroyan@users.noreply.github.com> Co-authored-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> Signed-off-by: Alex Cui --- .../text_normalization/hy/__init__.py | 13 ++ .../text_normalization/hy/data/__init__.py | 13 ++ .../text_normalization/hy/data/currency.tsv | 52 ++++++++ .../hy/data/measurement_dates.tsv | 8 ++ .../hy/data/measurements.tsv | 44 +++++++ .../hy/data/numbers/digit.tsv | 9 ++ .../hy/data/numbers/quantities.tsv | 3 + .../hy/data/numbers/ties.tsv | 8 ++ .../hy/data/ordinal/digit.tsv | 5 + .../text_normalization/hy/data/time/hours.tsv | 26 ++++ .../hy/data/time/minutes.tsv | 61 +++++++++ .../text_normalization/hy/data/whitelist.tsv | 14 +++ .../text_normalization/hy/taggers/__init__.py | 13 ++ .../text_normalization/hy/taggers/cardinal.py | 88 +++++++++++++ .../text_normalization/hy/taggers/decimal.py | 87 +++++++++++++ .../text_normalization/hy/taggers/fraction.py | 37 ++++++ 
.../text_normalization/hy/taggers/measure.py | 99 +++++++++++++++ .../text_normalization/hy/taggers/money.py | 83 +++++++++++++ .../text_normalization/hy/taggers/ordinal.py | 74 +++++++++++ .../hy/taggers/punctuation.py | 36 ++++++ .../text_normalization/hy/taggers/time.py | 38 ++++++ .../hy/taggers/tokenize_and_classify.py | 116 ++++++++++++++++++ .../hy/taggers/whitelist.py | 48 ++++++++ .../text_normalization/hy/taggers/word.py | 30 +++++ .../text_normalization/hy/utils.py | 43 +++++++ .../hy/verbalizers/__init__.py | 13 ++ .../hy/verbalizers/cardinal.py | 34 +++++ .../hy/verbalizers/decimal.py | 63 ++++++++++ .../hy/verbalizers/fraction.py | 44 +++++++ .../hy/verbalizers/measure.py | 76 ++++++++++++ .../hy/verbalizers/money.py | 39 ++++++ .../hy/verbalizers/ordinal.py | 36 ++++++ .../text_normalization/hy/verbalizers/time.py | 42 +++++++ .../hy/verbalizers/verbalize.py | 60 +++++++++ .../hy/verbalizers/verbalize_final.py | 51 ++++++++ .../hy/verbalizers/whitelist.py | 34 +++++ .../text_normalization/hy/verbalizers/word.py | 38 ++++++ .../text_normalization/normalize.py | 5 +- .../text_normalization/run_evaluate.py | 2 +- .../test_cases_cardinal.txt | 12 ++ .../test_cases_decimal.txt | 9 ++ .../test_cases_fraction.txt | 10 ++ .../test_cases_measure.txt | 9 ++ .../test_cases_money.txt | 10 ++ .../test_cases_ordinal.txt | 11 ++ .../test_cases_time.txt | 9 ++ .../test_cases_whitelist.txt | 14 +++ .../test_cases_word.txt | 50 ++++++++ .../nemo_text_processing/hy/test_cardinal.py | 10 ++ tests/nemo_text_processing/hy/test_decimal.py | 11 ++ .../nemo_text_processing/hy/test_fraction.py | 11 ++ tests/nemo_text_processing/hy/test_measure.py | 11 ++ tests/nemo_text_processing/hy/test_money.py | 11 ++ tests/nemo_text_processing/hy/test_ordinal.py | 11 ++ .../hy/test_sparrowhawk_normalization.sh | 69 +++++++++++ tests/nemo_text_processing/hy/test_time.py | 11 ++ .../nemo_text_processing/hy/test_whitelist.py | 11 ++ tests/nemo_text_processing/hy/test_word.py | 11 ++ 
.../pynini_export.py | 4 + 59 files changed, 1888 insertions(+), 2 deletions(-) create mode 100644 nemo_text_processing/text_normalization/hy/__init__.py create mode 100644 nemo_text_processing/text_normalization/hy/data/__init__.py create mode 100644 nemo_text_processing/text_normalization/hy/data/currency.tsv create mode 100644 nemo_text_processing/text_normalization/hy/data/measurement_dates.tsv create mode 100644 nemo_text_processing/text_normalization/hy/data/measurements.tsv create mode 100644 nemo_text_processing/text_normalization/hy/data/numbers/digit.tsv create mode 100644 nemo_text_processing/text_normalization/hy/data/numbers/quantities.tsv create mode 100644 nemo_text_processing/text_normalization/hy/data/numbers/ties.tsv create mode 100644 nemo_text_processing/text_normalization/hy/data/ordinal/digit.tsv create mode 100644 nemo_text_processing/text_normalization/hy/data/time/hours.tsv create mode 100644 nemo_text_processing/text_normalization/hy/data/time/minutes.tsv create mode 100644 nemo_text_processing/text_normalization/hy/data/whitelist.tsv create mode 100644 nemo_text_processing/text_normalization/hy/taggers/__init__.py create mode 100644 nemo_text_processing/text_normalization/hy/taggers/cardinal.py create mode 100644 nemo_text_processing/text_normalization/hy/taggers/decimal.py create mode 100644 nemo_text_processing/text_normalization/hy/taggers/fraction.py create mode 100644 nemo_text_processing/text_normalization/hy/taggers/measure.py create mode 100644 nemo_text_processing/text_normalization/hy/taggers/money.py create mode 100644 nemo_text_processing/text_normalization/hy/taggers/ordinal.py create mode 100644 nemo_text_processing/text_normalization/hy/taggers/punctuation.py create mode 100644 nemo_text_processing/text_normalization/hy/taggers/time.py create mode 100644 nemo_text_processing/text_normalization/hy/taggers/tokenize_and_classify.py create mode 100644 nemo_text_processing/text_normalization/hy/taggers/whitelist.py create mode 
100644 nemo_text_processing/text_normalization/hy/taggers/word.py create mode 100644 nemo_text_processing/text_normalization/hy/utils.py create mode 100644 nemo_text_processing/text_normalization/hy/verbalizers/__init__.py create mode 100644 nemo_text_processing/text_normalization/hy/verbalizers/cardinal.py create mode 100644 nemo_text_processing/text_normalization/hy/verbalizers/decimal.py create mode 100644 nemo_text_processing/text_normalization/hy/verbalizers/fraction.py create mode 100644 nemo_text_processing/text_normalization/hy/verbalizers/measure.py create mode 100644 nemo_text_processing/text_normalization/hy/verbalizers/money.py create mode 100644 nemo_text_processing/text_normalization/hy/verbalizers/ordinal.py create mode 100644 nemo_text_processing/text_normalization/hy/verbalizers/time.py create mode 100644 nemo_text_processing/text_normalization/hy/verbalizers/verbalize.py create mode 100644 nemo_text_processing/text_normalization/hy/verbalizers/verbalize_final.py create mode 100644 nemo_text_processing/text_normalization/hy/verbalizers/whitelist.py create mode 100644 nemo_text_processing/text_normalization/hy/verbalizers/word.py create mode 100644 tests/nemo_text_processing/hy/data_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/hy/data_text_normalization/test_cases_decimal.txt create mode 100644 tests/nemo_text_processing/hy/data_text_normalization/test_cases_fraction.txt create mode 100644 tests/nemo_text_processing/hy/data_text_normalization/test_cases_measure.txt create mode 100644 tests/nemo_text_processing/hy/data_text_normalization/test_cases_money.txt create mode 100644 tests/nemo_text_processing/hy/data_text_normalization/test_cases_ordinal.txt create mode 100644 tests/nemo_text_processing/hy/data_text_normalization/test_cases_time.txt create mode 100644 tests/nemo_text_processing/hy/data_text_normalization/test_cases_whitelist.txt create mode 100644 
tests/nemo_text_processing/hy/data_text_normalization/test_cases_word.txt create mode 100755 tests/nemo_text_processing/hy/test_sparrowhawk_normalization.sh diff --git a/nemo_text_processing/text_normalization/hy/__init__.py b/nemo_text_processing/text_normalization/hy/__init__.py new file mode 100644 index 000000000..9df65818d --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hy/data/__init__.py b/nemo_text_processing/text_normalization/hy/data/__init__.py new file mode 100644 index 000000000..9df65818d --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/hy/data/currency.tsv b/nemo_text_processing/text_normalization/hy/data/currency.tsv new file mode 100644 index 000000000..a1d8e03dd --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/data/currency.tsv @@ -0,0 +1,52 @@ +aed դիրհամ +ARS արգենտինական պեսո +֏ դրամ +֏ հայկական դրամ +a$ ավստրալական դոլար +br բելառուսական ռուբլի +₿ բիթքոին +£ ֆունտ +£ բրիտանական ֆունտ +ca$ կանադական դոլար +元 չինական յեն +元 յեն +kr դանիական կրոն +$ դոլար +€ եվրո +₾ վրացական լարի +₾ լարի +₹ հնդկական ռուփի +₹ ռուփի +﷼ պարսկական ռիալ +﷼ ռիալ +₪ իսրայելական շեկել +₪ շեկել +¥ ճապոնական յեն +¥ յեն +₸ ղազախական տենգե +₸ տենգե +som ղրղզական սոմ +som սոմ +ل.ل լիբանանյան ֆունտ +ل.ل լիբանանյան լիրա +nz$ նորզելանդական դոլլր +kr նորվեգական կրոն +zł լեհական զլոտի +zł զլոտի +£ ֆունտ ստերլինգ +£ ֆունտ +₽ ռուսական ռուբլի +₽ ռուբլի +rsd սերբական դինար +s$ սինգապուրի դոլար +₩ կորեական վոն +kr շվեդական կրոն +chf շվեյցարական ֆրանկ +£s սիրիական ֆունտ +₺ թուրքական լիրա +₴ ուկրաինական գրիվնա +$ ամերիկյան դոլար +$ ամն դոլար +som ուզբեկական սոմ +₩ վոն +¥ յեն \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hy/data/measurement_dates.tsv b/nemo_text_processing/text_normalization/hy/data/measurement_dates.tsv new file mode 100644 index 000000000..00ac92f9d --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/data/measurement_dates.tsv @@ -0,0 +1,8 @@ +թ. թվական +թթ. թվականներ +դ. դար +դդ. 
դարեր +թ․ թվական +թթ․ թվականներ +դ․ դար +դդ․ դարեր \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hy/data/measurements.tsv b/nemo_text_processing/text_normalization/hy/data/measurements.tsv new file mode 100644 index 000000000..37fe39154 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/data/measurements.tsv @@ -0,0 +1,44 @@ +°F ֆարենհայթ +°C ցելսիուս +K կելվին +կմ կիլոմետր +կմ/ժ կիլոմետր ժամ +մ մետր +մ/ժ մետր ժամ +սմ սանտիմետր +մմ միլիմետր +հա հեկտար +մղն մղոն +մ² քառակուսի մետր +կմ² քառակուսի կիլոմետր +% տոկոս +Հց հերց +կՎտ կիլովատտ +կՎտ կիլո վատտ +կՎ/Ժ կիլովատտ ժամ +Վտ/ժ վատտ ժամ +Վտ վատտ +ձ.ու. ձիաուժ +ձ․ու․ ձիաուժ +մգ միլիգրամ +կգ կիլոգրամ +Վ վոլտ +ժ ժամ +վ վայրկյան +ր րոպե +մ³ խորանարդ մետր +գ գրամ +տ տոննա +կբ կիլոբայթ +մբ մեգաբայթ +գբ գիգաբայթ +գբ գեգաբայթ +տբ տերաբայթ +տբ տեռաբայթ +կՎ կիլո վոլտ +մՎ մեգա վոլտ +Ա ամպեր +մԱ միլի ամպեր +մվ միլի վայրկյան +դմ դեցիմետր +սմ² քառակուսի սանտիմետր diff --git a/nemo_text_processing/text_normalization/hy/data/numbers/digit.tsv b/nemo_text_processing/text_normalization/hy/data/numbers/digit.tsv new file mode 100644 index 000000000..42feb82d5 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/data/numbers/digit.tsv @@ -0,0 +1,9 @@ +մեկ 1 +երկու 2 +երեք 3 +չորս 4 +հինգ 5 +վեց 6 +յոթ 7 +ութ 8 +ինը 9 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hy/data/numbers/quantities.tsv b/nemo_text_processing/text_normalization/hy/data/numbers/quantities.tsv new file mode 100644 index 000000000..8b53443af --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/data/numbers/quantities.tsv @@ -0,0 +1,3 @@ +միլիոն +միլիարդ +տրիլիոն \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hy/data/numbers/ties.tsv b/nemo_text_processing/text_normalization/hy/data/numbers/ties.tsv new file mode 100644 index 000000000..0faae0a9f --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/data/numbers/ties.tsv @@ 
-0,0 +1,8 @@ +քսան 2 +երեսուն 3 +քառասուն 4 +հիսուն 5 +վաթսուն 6 +յոթանասուն 7 +ութսուն 8 +իննսուն 9 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hy/data/ordinal/digit.tsv b/nemo_text_processing/text_normalization/hy/data/ordinal/digit.tsv new file mode 100644 index 000000000..92322351a --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/data/ordinal/digit.tsv @@ -0,0 +1,5 @@ +հինգերորդ հինգ +վեցերորդ վեց +յոթերորդ յոթ +ութերորդ ութ +իններորդ ինը \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hy/data/time/hours.tsv b/nemo_text_processing/text_normalization/hy/data/time/hours.tsv new file mode 100644 index 000000000..01656d4e8 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/data/time/hours.tsv @@ -0,0 +1,26 @@ +մեկ 01 +երկուսն 02 +երեք 03 +չորս 04 +հինգ 05 +վեց 06 +յոթ 07 +ութ 08 +ինն 09 +տասն 10 +տասնմեկ 11 +տասնմեկն 11 +տասներկուսն 12 +տասներեք 13 +տասնչորս 14 +տասնհինգ 15 +տասնվեց 16 +տասնյոթ 17 +տասնութ 18 +տասնինն 19 +քսան 20 +քսանմեկ 21 +քսաներկուսն 22 +քսաներեք 23 +քսանչորս 24 +քսանչորս 24 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hy/data/time/minutes.tsv b/nemo_text_processing/text_normalization/hy/data/time/minutes.tsv new file mode 100644 index 000000000..5035ad406 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/data/time/minutes.tsv @@ -0,0 +1,61 @@ +մեկ 01 +երկու 02 +երեք 03 +չորս 04 +հինգ 05 +վեց 06 +յոթ 07 +ութ 08 +ինը 09 +տաս 10 +տասնմեկ 11 +տասնմեկն 11 +տասներկու 12 +տասներեք 13 +տասնչորս 14 +տասնհինգ 15 +տասնվեց 16 +տասնյոթ 17 +տասնութ 18 +տասնինը 19 +քսան 20 +քսանմեկ 21 +քսաներկու 22 +քսաներեք 23 +քսանչորս 24 +քսանհինգ 25 +քսանվեց 26 +քսանյոթ 27 +քսանութ 28 +քսանինը 29 +երեսուն 30 +երեսունմեկ 31 +երեսուներկու 32 +երեսուներեք 33 +երեսունչորս 34 +երեսունհինգ 35 +երեսունվեց 36 +երեսունյոթ 37 +երեսունութ 38 +երեսունինը 39 +քառասուն 41 +քառասունմեկ 41 +քառասուներկու 42 +քառասուներեք 43 +քառասունչորս 44 
+քառասունհինգ 45 +քառասունվեց 46 +քառասունյոթ 47 +քառասունութ 48 +քառասունինը 49 +հիսուն 50 +հիսունմեկ 51 +հիսուներկու 52 +հիսուներեք 53 +հիսունչորս 54 +հիսունհինգ 55 +հիսունվեց 56 +հիսունյոթ 57 +հիսունութ 58 +հիսունինը 59 +զրո֊զրո 00 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hy/data/whitelist.tsv b/nemo_text_processing/text_normalization/hy/data/whitelist.tsv new file mode 100644 index 000000000..8ef036e32 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/data/whitelist.tsv @@ -0,0 +1,14 @@ +ս.թ. սույն թվականի +մ.թ.ա. մեր թվարկությունից առաջ +մ.թ. մեր թվարկություն +Ք.ա. քրիստոսից առաջ +Ք.հ. քրիստոսից հետո +Ք.ծ.ա. քրիստոսի ծննդից առաջ +Ք.ծ.հ. քրիստոսի ծննդից հետո +ս․թ․ սույն թվականի +մ․թ․ա․ մեր թվարկությունից առաջ +մ․թ․ մեր թվարկություն +Ք․ա․ քրիստոսից առաջ +Ք․հ․ քրիստոսից հետո +Ք․ծ․ա․ քրիստոսի ծննդից առաջ +Ք․ծ․հ․ քրիստոսի ծննդից հետո diff --git a/nemo_text_processing/text_normalization/hy/taggers/__init__.py b/nemo_text_processing/text_normalization/hy/taggers/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/taggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/hy/taggers/cardinal.py b/nemo_text_processing/text_normalization/hy/taggers/cardinal.py new file mode 100644 index 000000000..8a418f117 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/taggers/cardinal.py @@ -0,0 +1,88 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.hy.utils import get_abs_path + + +class CardinalFst(GraphFst): + """ + Finite state transducer for classifying cardinals, e.g. 
+ 55 -> cardinal { integer: "հիսունհինգ" } + """ + + def __init__(self): + super().__init__(name="cardinal", kind="classify") + + zero = pynini.string_map([("0", "զրո")]) + digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() + digits_no_one = (NEMO_DIGIT - "1") @ digits + + ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")).invert() + ties_unit = digits + double_digits = (pynini.cross("1", "տասը") | ties) + pynutil.delete("0") | ( + pynini.cross("1", "տասն") | ties + ) + ties_unit + + self.all_double_digits = double_digits.optimize() + + one_to_all_tens = digits | double_digits + self.one_to_all_tens = one_to_all_tens.optimize() + + hundreds_parts = (pynutil.delete("0") + insert_space + digits) | (insert_space + double_digits) + one_hundreds = pynini.cross("1", "հարյուր") + (pynutil.delete("00") | hundreds_parts) + multiple_hundreds = (digits_no_one + insert_space + pynutil.insert("հարյուր")) + ( + pynutil.delete("00") | hundreds_parts + ) + all_hundreds = one_hundreds | multiple_hundreds + self.all_hundreds = all_hundreds.optimize() + + delete_separator = pynini.closure(delete_space, 0, 1) + one_thousand = pynini.cross("1", "հազար") + delete_separator + other_thousands = ( + (digits_no_one | double_digits | all_hundreds) + insert_space + pynutil.insert("հազար") + delete_separator + ) + all_thousands = ( + ((one_thousand | other_thousands) + pynutil.delete("000")) + | (one_thousand + pynutil.delete("00") + insert_space + digits) + | (other_thousands + pynutil.delete("00") + insert_space + digits) + | ((one_thousand | other_thousands) + pynutil.delete("0") + insert_space + double_digits) + | ((one_thousand | other_thousands) + insert_space + all_hundreds) + ) + + digits_to_hundreds = digits | double_digits | all_hundreds + digits_to_thousands = digits | double_digits | all_hundreds | all_thousands + millions_components = pynini.closure(delete_separator + pynini.closure(NEMO_DIGIT, 3), 2) + delete_zeros = 
pynini.closure(pynutil.delete("0"), 0, 6) + all_millions = (digits_to_hundreds + insert_space + pynutil.insert("միլիոն")) + ( + millions_components @ (delete_zeros + pynini.closure(insert_space + digits_to_thousands, 0, 1)) + ) + + digits_to_millions = digits_to_thousands | all_millions + billions_components = pynini.closure(delete_separator + pynini.closure(NEMO_DIGIT, 3), 3) + delete_zeros = pynini.closure(pynutil.delete("0"), 0, 9) + all_billions = (digits_to_hundreds + insert_space + pynutil.insert("միլիարդ")) + ( + billions_components @ (delete_zeros + pynini.closure(insert_space + digits_to_millions, 0, 1)) + ) + + final_graph = zero | digits | double_digits | all_hundreds | all_thousands | all_millions | all_billions + self.all_nums_no_tokens = final_graph + + final_graph = pynutil.insert("integer: \"") + final_graph + pynutil.insert("\"") + self.final_graph = final_graph + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hy/taggers/decimal.py b/nemo_text_processing/text_normalization/hy/taggers/decimal.py new file mode 100644 index 000000000..ad9474670 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/taggers/decimal.py @@ -0,0 +1,87 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, + NEMO_SIGMA, + NEMO_SPACE, + GraphFst, + insert_space, +) +from nemo_text_processing.text_normalization.hy.utils import get_abs_path + + +def get_quantity(decimal_graph: "pynini.FstLike", cardinal_graph: "pynini.FstLike") -> "pynini.FstLike": + """ + Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, + e.g. 2 միլիոն -> integer_part: "երկու" quantity: "միլիոն" + e.g. 2․4 միլիոն -> integer_part: "երկու" fractional_part: "չորս" quantity: "միլիոն" + Args: + decimal_graph: DecimalFST + cardinal_graph: CardinalFST + """ + quantities = pynini.string_file(get_abs_path("data/numbers/quantities.tsv")) + delete_separator = pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1) + numbers = pynini.closure(NEMO_DIGIT, 1, 6) @ cardinal_graph + numbers = pynini.cdrewrite(pynutil.delete(delete_separator), "", "", NEMO_SIGMA) @ numbers + + res = ( + pynutil.insert('integer_part: "') + + numbers + + pynutil.insert('"') + + NEMO_SPACE + + pynutil.insert('quantity: "') + + quantities + + pynutil.insert('"') + ) + res |= decimal_graph + NEMO_SPACE + pynutil.insert('quantity: "') + quantities + pynutil.insert('"') + return res + + +class DecimalFst(GraphFst): + """ + Finite state transducer for classifying decimal, e.g. 
+ 554 միլիարդ -> decimal { integer_part: "հինգ հարյուր հիսունչորս" quantity: "միլիարդ" } + Args: + cardinal: CardinalFst + deterministic is not necessary right now + TODO make deterministic make sense + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="decimal", kind="classify", deterministic=deterministic) + + graph = cardinal.one_to_all_tens + + graph = graph.optimize() + + delete_separator = pynutil.delete(".") | pynutil.delete("․") + optional_graph_negative = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", '"true" '), 0, 1) + + graph_fractional = pynutil.insert('fractional_part: "') + graph + pynutil.insert('"') + + integers = cardinal.all_nums_no_tokens + graph_integer = pynutil.insert('integer_part: "') + integers + pynutil.insert('"') + final_graph_wo_sign = graph_integer + delete_separator + insert_space + graph_fractional + + final_graph_wo_negative = final_graph_wo_sign | get_quantity(final_graph_wo_sign, integers) + self.final_graph_wo_negative = final_graph_wo_negative.optimize() + + final_graph = optional_graph_negative + final_graph_wo_negative + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hy/taggers/fraction.py b/nemo_text_processing/text_normalization/hy/taggers/fraction.py new file mode 100644 index 000000000..6cf4c1fee --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/taggers/fraction.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import INPUT_LOWER_CASED, GraphFst + + +class FractionFst(GraphFst): + """ + Finite state transducer for classifying fraction, e.g. + "4/5" -> tokens { fraction { numerator: "չորս" denominator: "հինգերորդ" } } + "1/6" -> tokens { fraction { numerator: "մեկ" denominator: "վեցերորդ" } } + """ + + def __init__(self, cardinal: GraphFst, ordinal: GraphFst, input_case: str = INPUT_LOWER_CASED): + super().__init__(name="fraction", kind="classify") + cardinal_graph = cardinal.all_nums_no_tokens + ordinal_graph = ordinal.denominator_graph + + numerator = pynutil.insert("numerator: \"") + cardinal_graph + pynutil.insert("\"") + denominator = pynutil.insert(" denominator: \"") + ordinal_graph + pynutil.insert("\"") + + final_graph = numerator + pynutil.delete("/") + denominator + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hy/taggers/measure.py b/nemo_text_processing/text_normalization/hy/taggers/measure.py new file mode 100644 index 000000000..004b76c07 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/taggers/measure.py @@ -0,0 +1,99 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_LOWER_CASED, + NEMO_SIGMA, + TO_LOWER, + GraphFst, + delete_extra_space, +) +from nemo_text_processing.text_normalization.hy.utils import get_abs_path + + +class MeasureFst(GraphFst): + """ + Finite state transducer for classifying measure + e.g. 52 կգ -> measure { cardinal { integer: "հիսուներկու" } units: "կիլոգրամ" } + + Args: + cardinal: CardinalFst + decimal: DecimalFst + """ + + def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPUT_LOWER_CASED): + super().__init__(name="measure", kind="classify") + + cardinal_graph = cardinal.all_nums_no_tokens + + casing_graph = pynini.closure(TO_LOWER | NEMO_SIGMA) + + graph_measurements_unit = pynini.string_file(get_abs_path("data/measurements.tsv")) + graph_measurements_unit = pynini.compose(casing_graph, graph_measurements_unit) + + graph_measurements_dates_unit = pynini.string_file(get_abs_path("data/measurement_dates.tsv")) + graph_measurements_dates_unit = pynini.compose(casing_graph, graph_measurements_dates_unit) + + measurements_unit = pynutil.insert("units: \"") + graph_measurements_unit + pynutil.insert("\"") + + measurements_dates_unit = pynutil.insert("units: \"") + graph_measurements_dates_unit + pynutil.insert("\"") + + subgraph_decimal = ( + pynutil.insert("decimal { ") + + decimal.final_graph_wo_negative + + pynutil.insert(" }") + + pynini.closure(delete_extra_space, 0, 1) + + measurements_unit + ) + + subgraph_cardinal = ( + 
pynutil.insert("cardinal { ") + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" }") + + pynini.closure(delete_extra_space, 0, 1) + + measurements_unit + ) + + subgraph_cardinal_dates = ( + pynutil.insert("cardinal { ") + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" }") + + pynini.closure(delete_extra_space, 0, 1) + + measurements_dates_unit + ) + + subgraph_cardinal_dates |= ( + pynutil.insert("cardinal { ") + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("ից") + + pynutil.delete("-") + + pynutil.insert(' ') + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" }") + + pynini.closure(delete_extra_space, 0, 1) + + measurements_dates_unit + ) + + final_graph = subgraph_decimal | subgraph_cardinal | subgraph_cardinal_dates + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hy/taggers/money.py b/nemo_text_processing/text_normalization/hy/taggers/money.py new file mode 100644 index 000000000..e84bb5b0c --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/taggers/money.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space +from nemo_text_processing.text_normalization.hy.utils import get_abs_path + + +class MoneyFst(GraphFst): + """ + Finite state transducer for classifying money, e.g. + "15 $" -> money { "տասնհինգ դոլար" } + + Args: + cardinal: CardinalFst + decimal: DecimalFst + """ + + def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True): + super().__init__(name="money", kind="classify", deterministic=deterministic) + cardinal_graph = cardinal.final_graph + decimal_graph = decimal.fst + + unit = pynini.string_file(get_abs_path("data/currency.tsv")) + + weighted_delimiter = pynutil.add_weight(pynutil.delete(NEMO_SPACE), -100) + optional_delimiter = pynini.closure(weighted_delimiter, 0, 1) + graph_unit_singular = optional_delimiter + pynutil.insert(" currency: \"") + unit + pynutil.insert("\"") + + graph_decimal = decimal_graph + graph_unit_singular + graph_cardinal = cardinal_graph + graph_unit_singular + + tagger_graph = graph_cardinal | graph_decimal + + integer = pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + integer_cardinal = pynutil.delete("integer: ") + integer + integer_part = pynutil.delete("integer_part: ") + integer + + unit = ( + pynutil.delete("currency: ") + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + unit = pynini.accep(NEMO_SPACE) + unit + + verbalizer_graph_cardinal = integer_cardinal + unit + + optional_fractional_part = pynini.closure(pynutil.delete("fractional_part: ") + integer, 0, 1) + optional_quantity = pynini.closure(pynini.accep(NEMO_SPACE) + pynutil.delete("quantity: ") + integer, 0, 1) + + verbalizer_graph_decimal = ( + pynutil.delete('decimal { ') + + integer_part + + delete_space + + pynutil.insert(" ամբողջ ") + + optional_fractional_part + + delete_space + + 
optional_quantity + + delete_space + + pynutil.delete(" }") + + unit + ) + + verbalizer_graph = verbalizer_graph_cardinal | verbalizer_graph_decimal + + self.final_graph = (tagger_graph @ verbalizer_graph).optimize() + self.fst = self.add_tokens( + pynutil.insert("integer_part: \"") + self.final_graph + pynutil.insert("\"") + ).optimize() diff --git a/nemo_text_processing/text_normalization/hy/taggers/ordinal.py b/nemo_text_processing/text_normalization/hy/taggers/ordinal.py new file mode 100644 index 000000000..ec486bd92 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/taggers/ordinal.py @@ -0,0 +1,74 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for classifying ordinal, e.g. 
+ 5-րդ -> ordinal { integer: "հինգերորդ" } + 1-ին -> ordinal { integer: "առաջին" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="ordinal", kind="classify", deterministic=deterministic) + + cardinal_graph = cardinal.all_nums_no_tokens + cardinal_format = pynini.closure(NEMO_DIGIT | pynini.accep(",")) + rd = pynini.accep("-րդ") + first_format = ( + pynini.closure(cardinal_format + (NEMO_DIGIT - "1"), 0, 1) + pynini.accep("1") + pynutil.delete("-ին") + ) + second_format = pynini.closure(cardinal_format + (NEMO_DIGIT - "2"), 0, 1) + pynini.accep("2") + third_format = pynini.closure(cardinal_format + (NEMO_DIGIT - "1"), 0, 1) + pynini.accep("3") + fourth_format = pynini.closure(cardinal_format + (NEMO_DIGIT - "1"), 0, 1) + pynini.accep("4") + th_format = pynini.closure( + (NEMO_DIGIT - "1" - "2" - "3" - "4") | (cardinal_format + "1" + NEMO_DIGIT) | cardinal_format, 1 + ) + + first = pynini.cross("1", "առաջին") + second = pynini.cross("2", "երկրորդ") + third = pynini.cross("3", "երրորդ") + fourth = pynini.cross("4", "չորրորդ") + + special_denominator_graph = second_format @ second | third_format @ third | fourth_format @ fourth + + self.denominator_graph = ( + pynutil.add_weight(first_format @ first, 1) + | pynutil.add_weight(special_denominator_graph, 1) + | pynutil.add_weight(th_format @ cardinal_graph + pynutil.insert("երորդ"), 1.5) + ).optimize() + + special_ordinals_graph = ( + (second_format + pynutil.delete(rd)) @ second + | (third_format + pynutil.delete(rd)) @ third + | (fourth_format + pynutil.delete(rd)) @ fourth + ) + + self.graph = ( + pynutil.add_weight(first_format @ first, 1) + | pynutil.add_weight(special_ordinals_graph, 1) + | pynutil.add_weight((th_format + pynutil.delete(rd)) @ cardinal_graph + pynutil.insert("երորդ"), 1.5) + ).optimize() + + final_graph = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") + final_graph = 
self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hy/taggers/punctuation.py b/nemo_text_processing/text_normalization/hy/taggers/punctuation.py new file mode 100644 index 000000000..8f6852cbd --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/taggers/punctuation.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + + +class PunctuationFst(GraphFst): + """ + Finite state transducer for classifying punctuation + e.g. , -> tokens { name: "," } + + """ + + def __init__(self): + super().__init__(name="punctuation", kind="classify") + + s = "!#$%&\'()*+,-./:;<=>?@^_`{|}~" + punct = pynini.union(*s) + + graph = pynutil.insert("name: \"") + punct + pynutil.insert("\"") + + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hy/taggers/time.py b/nemo_text_processing/text_normalization/hy/taggers/time.py new file mode 100644 index 000000000..5c555456e --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/taggers/time.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import INPUT_LOWER_CASED, GraphFst +from nemo_text_processing.text_normalization.hy.utils import get_abs_path + + +class TimeFst(GraphFst): + """ + Finite state transducer for classifying time, e.g. + "15:52" -> time { hours: "տասնհինգ" minutes: "հիսուներկու" } + """ + + def __init__(self, input_case: str = INPUT_LOWER_CASED): + super().__init__(name="time", kind="classify") + hours = pynini.string_file(get_abs_path('data/time/hours.tsv')).invert() + minutes = pynini.string_file(get_abs_path('data/time/minutes.tsv')).invert() + + graph_hours = pynutil.insert("hours: \"") + hours + pynutil.insert("\"") + graph_minutes = pynutil.insert(" minutes: \"") + minutes + pynutil.insert("\"") + + final_graph = graph_hours + pynutil.delete(":") + graph_minutes + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hy/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hy/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..08e121f86 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/taggers/tokenize_and_classify.py @@ -0,0 +1,116 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_LOWER_CASED, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) +from nemo_text_processing.text_normalization.hy.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.hy.taggers.decimal import DecimalFst +from nemo_text_processing.text_normalization.hy.taggers.fraction import FractionFst +from nemo_text_processing.text_normalization.hy.taggers.measure import MeasureFst +from nemo_text_processing.text_normalization.hy.taggers.money import MoneyFst +from nemo_text_processing.text_normalization.hy.taggers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.hy.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.hy.taggers.time import TimeFst +from nemo_text_processing.text_normalization.hy.taggers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.hy.taggers.word import WordFst +from nemo_text_processing.utils.logging import logger + + +class ClassifyFst(GraphFst): + """ + Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + + Args: + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+ overwrite_cache: set to True to overwrite .far files + """ + + def __init__( + self, + cache_dir: str = None, + whitelist: str = None, + deterministic: bool = False, + overwrite_cache: bool = False, + input_case: str = INPUT_LOWER_CASED, + ): + super().__init__(name="tokenize_and_classify", kind="classify") + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"_hy_itn_{input_case}.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] + logger.info(f"ClassifyFst.fst was restored from {far_file}.") + else: + logger.info(f"Creating ClassifyFst grammars.") + + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + + ordinal = OrdinalFst(cardinal) + ordinal_graph = ordinal.fst + + fraction = FractionFst(cardinal=cardinal, ordinal=ordinal) + fraction_graph = fraction.fst + + decimal = DecimalFst(cardinal) + decimal_graph = decimal.fst + + measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst + word_graph = WordFst().fst + time_graph = TimeFst().fst + money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst + punct_graph = PunctuationFst().fst + whitelist_graph = WhiteListFst( + input_case=input_case, deterministic=deterministic, input_file=whitelist + ).fst + + classify = ( + pynutil.add_weight(whitelist_graph, 1.01) + | pynutil.add_weight(time_graph, 1.1) + | pynutil.add_weight(decimal_graph, 1.1) + | pynutil.add_weight(measure_graph, 0.9) + | pynutil.add_weight(cardinal_graph, 1.1) + | pynutil.add_weight(ordinal_graph, 1.1) + | pynutil.add_weight(fraction_graph, 1.09) + | pynutil.add_weight(money_graph, 1.1) + | pynutil.add_weight(word_graph, 100) + ) + + punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") + token_plus_punct = ( + 
pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) + ) + + graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct) + graph = delete_space + graph + delete_space + + self.fst = graph.optimize() + + if far_file: + generator_main(far_file, {"tokenize_and_classify": self.fst}) diff --git a/nemo_text_processing/text_normalization/hy/taggers/whitelist.py b/nemo_text_processing/text_normalization/hy/taggers/whitelist.py new file mode 100644 index 000000000..4b68bc011 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/taggers/whitelist.py @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space, load_labels +from nemo_text_processing.text_normalization.hy.utils import get_abs_path + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for classifying whitelist, e.g. + մ.թ.ա. -> tokens { name: "մեր թվարկությունից առաջ" } + This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv". 
+ + Args: + input_file: path to a file with whitelist replacements + """ + + def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None): + super().__init__(name="whitelist", kind="classify") + + def _get_whitelist_graph(file): + whitelist = load_labels(file) + whitelist = [[x, y] for x, y in whitelist] + + graph = pynini.string_map(whitelist) + return graph + + whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")) + if input_file: + whitelist_provided = _get_whitelist_graph(input_file) + whitelist |= whitelist_provided + + graph = pynutil.insert("name: \"") + convert_space(whitelist) + pynutil.insert("\"") + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hy/taggers/word.py b/nemo_text_processing/text_normalization/hy/taggers/word.py new file mode 100644 index 000000000..fcacc6c48 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/taggers/word.py @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst + + +class WordFst(GraphFst): + """ + Finite state transducer for classifying word. Considers sentence boundary exceptions. + e.g. 
բարև -> tokens { name: "բարև" } + """ + + def __init__(self): + super().__init__(name="word", kind="classify") + word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + self.fst = word.optimize() diff --git a/nemo_text_processing/text_normalization/hy/utils.py b/nemo_text_processing/text_normalization/hy/utils.py new file mode 100644 index 000000000..7abe91e9e --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/utils.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import os + + +def get_abs_path(rel_path): + """ + Get absolute path + + Args: + rel_path: relative path to this file + + Returns absolute path + """ + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + + +def load_labels(abs_path): + """ + loads a file at the given absolute path as a list of label pairs + + Args: + abs_path: absolute path + + Returns list of mappings + """ + label_tsv = open(abs_path) + labels = list(csv.reader(label_tsv, delimiter="\t")) + label_tsv.close() + return labels diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/__init__.py b/nemo_text_processing/text_normalization/hy/verbalizers/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/verbalizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/hy/verbalizers/cardinal.py new file mode 100644 index 000000000..9b4eeb588 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/verbalizers/cardinal.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + + +class CardinalFst(GraphFst): + """ + Finite state transducer for verbalizing cardinal, e.g. 
+ cardinal { integer: "հիսունհինգ" } -> հիսունհինգ + + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic) + + number = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + final_graph = number + self.numbers = final_graph + self.fst = self.delete_tokens(final_graph).optimize() diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/decimal.py b/nemo_text_processing/text_normalization/hy/verbalizers/decimal.py new file mode 100644 index 000000000..bdea252e8 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/verbalizers/decimal.py @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_preserve_order, + delete_space, + insert_space, +) + + +class DecimalFst(GraphFst): + """ + Finite state transducer for verbalizing decimal, e.g. 
+ decimal { integer_part: "հինգ" quantity: "միլիոն" } -> հինգ միլիոն + + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="decimal", kind="classify", deterministic=deterministic) + + integer = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + fractional_default = ( + pynutil.delete("fractional_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) + + conjunction = pynutil.insert(" ամբողջ ") + fractional = conjunction + fractional_default + + quantity = ( + delete_space + + insert_space + + pynutil.delete("quantity: \"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + optional_quantity = pynini.closure(quantity, 0, 1) + + graph = pynini.union((integer + quantity), (integer + delete_space + fractional + optional_quantity)) + + self.numbers_only_quantity = pynini.union( + (integer + quantity), (integer + delete_space + fractional + quantity) + ).optimize() + + graph += delete_preserve_order + self.numbers = graph + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hy/verbalizers/fraction.py new file mode 100644 index 000000000..ff7dbd259 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/verbalizers/fraction.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space + + +class FractionFst(GraphFst): + """ + Finite state transducer for verbalizing fraction + e.g. fraction { numerator: "երկու" denominator: "երրորդ" } -> երկու երրորդ + + """ + + def __init__(self): + super().__init__(name="fraction", kind="verbalize") + numerator = pynutil.delete("numerator: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + + denominator = ( + pynutil.insert(' ') + + pynutil.delete("denominator: \"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + suffix = pynini.cdrewrite(pynini.cross("ըերորդ", "ներորդ"), "", "[EOS]", NEMO_SIGMA).optimize() + + graph = (numerator + delete_space + pynini.compose(denominator, suffix)).optimize() + self.numbers = graph.optimize() + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/measure.py b/nemo_text_processing/text_normalization/hy/verbalizers/measure.py new file mode 100644 index 000000000..a844f643e --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/verbalizers/measure.py @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_NOT_QUOTE, GraphFst, delete_space + + +class MeasureFst(GraphFst): + """ + Finite state transducer for verbalizing measure, e.g. + measure { cardinal { integer: "վաթսուն" } units: "կիլոգրամ" } -> վաթսուն կիլոգրամ + + Args: + decimal: DecimalFst + cardinal: CardinalFst + """ + + def __init__(self, decimal: GraphFst, cardinal: GraphFst): + super().__init__(name="measure", kind="verbalize") + optional_sign = pynini.closure(pynini.cross("negative: \"true\"", "-"), 0, 1) + unit = ( + pynutil.delete("units:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + + delete_space + ) + graph_decimal = ( + pynutil.delete("decimal {") + + delete_space + + optional_sign + + delete_space + + decimal.numbers + + delete_space + + pynutil.delete("}") + ) + graph_cardinal_first = ( + pynutil.delete("cardinal {") + + delete_space + + optional_sign + + delete_space + + cardinal.numbers + + delete_space + + pynutil.delete("}") + ) + graph_cardinal_two = ( + pynutil.delete("cardinal {") + + pynutil.delete(" integer: \"") + + delete_space + + optional_sign + + delete_space + + pynini.closure(NEMO_CHAR - " ", 1) + + pynutil.delete("\"") + + delete_space + + pynutil.delete("} ") + ) + graph_first = (graph_cardinal_first | graph_decimal) + delete_space + pynutil.insert(" ") + unit + graph_second = graph_cardinal_two + delete_space + pynutil.insert(" ") + unit + graph = graph_first | graph_second + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/money.py b/nemo_text_processing/text_normalization/hy/verbalizers/money.py new file mode 100644 index 000000000..f734278ca --- /dev/null +++ 
b/nemo_text_processing/text_normalization/hy/verbalizers/money.py @@ -0,0 +1,39 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + + +class MoneyFst(GraphFst): + """ + Finite state transducer for verbalizing money, e.g. + money { integer_part: "երկու դոլար" } -> երկու դոլար + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transductions are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="money", kind="verbalize", deterministic=deterministic) + + delete_tokens = self.delete_tokens( + pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/hy/verbalizers/ordinal.py new file mode 100644 index 000000000..73ed4c918 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/verbalizers/ordinal.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for verbalizing ordinal, e.g. + ordinal { integer: "տասնչորս" } } -> տասնչորսերորդ + + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic) + + graph = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + suffix = pynini.cdrewrite(pynini.cross("ըերորդ", "ներորդ"), "", "[EOS]", NEMO_SIGMA).optimize() + self.graph = (pynini.compose(graph, suffix)).optimize() + delete_tokens = self.delete_tokens(self.graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/time.py b/nemo_text_processing/text_normalization/hy/verbalizers/time.py new file mode 100644 index 000000000..94f9f43dc --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/verbalizers/time.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + +class TimeFst(GraphFst): + """ + Finite state transducer for verbalizing time + e.g. time { hours: "ինն" minutes: "քսաներկու" } -> ինն անց քսաներկու + + """ + + def __init__(self): + super().__init__(name="time", kind="verbalize") + numerator = pynutil.delete("hours: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + + denominator = ( + pynutil.insert(' անց ') + + pynutil.delete("minutes: \"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + graph = (numerator + delete_space + denominator).optimize() + self.numbers = graph.optimize() + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hy/verbalizers/verbalize.py new file mode 100644 index 000000000..810b1af49 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/verbalizers/verbalize.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.hy.verbalizers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.hy.verbalizers.decimal import DecimalFst +from nemo_text_processing.text_normalization.hy.verbalizers.fraction import FractionFst +from nemo_text_processing.text_normalization.hy.verbalizers.measure import MeasureFst +from nemo_text_processing.text_normalization.hy.verbalizers.money import MoneyFst +from nemo_text_processing.text_normalization.hy.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.hy.verbalizers.time import TimeFst +from nemo_text_processing.text_normalization.hy.verbalizers.whitelist import WhiteListFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. 
+ + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, deterministic=True): + super().__init__(name="verbalize", kind="verbalize") + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + ordinal_graph = OrdinalFst().fst + decimal = DecimalFst() + decimal_graph = decimal.fst + fraction = FractionFst() + fraction_graph = fraction.fst + measure_graph = MeasureFst(decimal=decimal, cardinal=cardinal).fst + money_graph = MoneyFst().fst + time_graph = TimeFst().fst + whitelist_graph = WhiteListFst().fst + graph = ( + time_graph + | fraction_graph + | measure_graph + | money_graph + | ordinal_graph + | decimal_graph + | cardinal_graph + | whitelist_graph + ) + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/hy/verbalizers/verbalize_final.py new file mode 100644 index 000000000..aebadd456 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/verbalizers/verbalize_final.py @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space +from nemo_text_processing.text_normalization.hy.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.text_normalization.hy.verbalizers.word import WordFst + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence, e.g. + tokens { name: "Երևանում" } tokens { name: "ժամը" } tokens { time { hours: "տասներկուսն" minutes: "հիսունհինգ" } } tokens { name: "է" } tokens { name: ":" } -> Երևանում ժամը տասներկուսն անց հիսունհինգ է: + + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. + overwrite_cache: set to True to overwrite .far files + """ + + def __init__(self, deterministic=True, cache_dir=None, overwrite_cache=False): + super().__init__(name="verbalize_final", kind="verbalize") + verbalize = VerbalizeFst().fst + word = WordFst().fst + types = verbalize | word + graph = ( + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + self.fst = graph diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/hy/verbalizers/whitelist.py new file mode 100644 index 000000000..7d5783688 --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/verbalizers/whitelist.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + + +class WhiteListFst(GraphFst): + def __init__(self): + super().__init__(name="whitelist", kind="verbalize") + graph = ( + pynutil.delete("name:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_CHAR - " ", 1) + + pynutil.delete("\"") + ) + graph = graph @ pynini.cdrewrite( + pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA + ) # Removes possible null token + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/word.py b/nemo_text_processing/text_normalization/hy/verbalizers/word.py new file mode 100644 index 000000000..b0174d35e --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/verbalizers/word.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + + +class WordFst(GraphFst): + """ + Finite state transducer for verbalizing word + e.g. tokens { name: "արթնանալ" } -> արթնանալ + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self): + super().__init__(name="word", kind="verbalize") + chars = pynini.closure(NEMO_CHAR - " ", 1) + char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 5093eceef..bccb5ca06 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -162,6 +162,9 @@ def __init__( elif lang == 'it': from nemo_text_processing.text_normalization.it.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.text_normalization.it.verbalizers.verbalize_final import VerbalizeFinalFst + elif lang == 'hy': + from nemo_text_processing.text_normalization.hy.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.text_normalization.hy.verbalizers.verbalize_final import VerbalizeFinalFst else: raise NotImplementedError(f"Language {lang} has not been supported yet.") @@ -708,7 +711,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it"], + choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it", "hy"], default="en", type=str, ) diff --git a/nemo_text_processing/text_normalization/run_evaluate.py 
b/nemo_text_processing/text_normalization/run_evaluate.py index f64771265..5602a2985 100644 --- a/nemo_text_processing/text_normalization/run_evaluate.py +++ b/nemo_text_processing/text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh'], + choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy'], default="en", type=str, ) diff --git a/tests/nemo_text_processing/hy/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..eb52ea310 --- /dev/null +++ b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,12 @@ +0~զրո +6~վեց +91~իննսունմեկ +10~տասը +13~տասներեք +83~ութսուներեք +100~հարյուր +175~հարյուր յոթանասունհինգ +312247~երեք հարյուր տասներկու հազար երկու հարյուր քառասունյոթ +1000000~մեկ միլիոն +1892004~մեկ միլիոն ութ հարյուր իննսուներկու հազար չորս +8555~ութ հազար հինգ հարյուր հիսունհինգ diff --git a/tests/nemo_text_processing/hy/data_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..d2a50b20d --- /dev/null +++ b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_decimal.txt @@ -0,0 +1,9 @@ +0.2 միլիարդ~զրո ամբողջ երկու միլիարդ +30.2 միլիարդ~երեսուն ամբողջ երկու միլիարդ +390.3 միլիոն~երեք հարյուր իննսուն ամբողջ երեք միլիոն +10 միլիարդ~տասը միլիարդ +818303~ութ հարյուր տասնութ հազար երեք հարյուր երեք +24313~քսանչորս հազար երեք հարյուր տասներեք +5988005~հինգ միլիոն ինը հարյուր ութսունութ հազար հինգ +13~տասներեք +8900~ութ հազար ինը հարյուր diff --git a/tests/nemo_text_processing/hy/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_fraction.txt new file mode 100644 index 000000000..7415fceda 
--- /dev/null +++ b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_fraction.txt @@ -0,0 +1,10 @@ +1/2~մեկ երկրորդ +1/3~մեկ երրորդ +1/4~մեկ չորրորդ +50/85~հիսուն ութսունհինգերորդ +1/6~մեկ վեցերորդ +1/7~մեկ յոթերորդ +1/8~մեկ ութերորդ +2/9~երկու իններորդ +1/10~մեկ տասներորդ +59/3~հիսունինը երրորդ diff --git a/tests/nemo_text_processing/hy/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..3774f1dd9 --- /dev/null +++ b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_measure.txt @@ -0,0 +1,9 @@ +200 մ~երկու հարյուր մետր +56.3 կմ²~հիսունվեց ամբողջ երեք քառակուսի կիլոմետր +100 կմ/ժ~հարյուր կիլոմետր ժամ +95 ր~իննսունհինգ րոպե +10 կգ~տասը կիլոգրամ +90 գ~իննսուն գրամ +300 սմ~երեք հարյուր սանտիմետր +65 կմ²~վաթսունհինգ քառակուսի կիլոմետր +50 դմ~հիսուն դեցիմետր diff --git a/tests/nemo_text_processing/hy/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_money.txt new file mode 100644 index 000000000..0da5951dc --- /dev/null +++ b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_money.txt @@ -0,0 +1,10 @@ +1 $~մեկ դոլար +1 ₽~մեկ ռուբլի +31 ₽~երեսունմեկ ռուբլի +16000 $~տասնվեց հազար դոլար +18000 $~տասնութ հազար դոլար +55000 ֏~հիսունհինգ հազար դրամ +2.5 միլիոն ₸~երկու ամբողջ հինգ միլիոն տենգե +3.2 միլիարդ ₾~երեք ամբողջ երկու միլիարդ լարի +90021 zł~իննսուն հազար քսանմեկ զլոտի +3 ₿~երեք բիթքոին diff --git a/tests/nemo_text_processing/hy/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_ordinal.txt new file mode 100644 index 000000000..7c59cfcc6 --- /dev/null +++ b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,11 @@ +1-ին~առաջին +3-րդ~երրորդ +8-րդ~ութերորդ +14-րդ~տասնչորսերորդ +6-րդ~վեցերորդ +5-րդ~հինգերորդ +9-րդ~իններորդ +4-րդ~չորրորդ +100-րդ~հարյուրերորդ 
+1000-րդ~հազարերորդ +2540-րդ~երկու հազար հինգ հարյուր քառասուներորդ diff --git a/tests/nemo_text_processing/hy/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..d48caae3a --- /dev/null +++ b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_time.txt @@ -0,0 +1,9 @@ +08:55~ութ անց հիսունհինգ +02:55~երկուսն անց հիսունհինգ +05:30~հինգ անց երեսուն +23:55~քսաներեք անց հիսունհինգ +01:30~մեկ անց երեսուն +10:45~տասն անց քառասունհինգ +05:50~հինգ անց հիսուն +02:22~երկուսն անց քսաներկու +07:30~յոթ անց երեսուն diff --git a/tests/nemo_text_processing/hy/data_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..3994e668d --- /dev/null +++ b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,14 @@ +ս.թ.~սույն թվականի +մ.թ.ա.~մեր թվարկությունից առաջ +մ.թ.~մեր թվարկություն +Ք.ա.~քրիստոսից առաջ +Ք.հ.~քրիստոսից հետո +Ք.ծ.ա.~քրիստոսի ծննդից առաջ +Ք.ծ.հ.~քրիստոսի ծննդից հետո +ս․թ․~սույն թվականի +մ․թ․ա․~մեր թվարկությունից առաջ +մ․թ․~մեր թվարկություն +Ք․ա․~քրիստոսից առաջ +Ք․հ․~քրիստոսից հետո +Ք․ծ․ա․~քրիստոսի ծննդից առաջ +Ք․ծ․հ․~քրիստոսի ծննդից հետո diff --git a/tests/nemo_text_processing/hy/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..ae39d0200 --- /dev/null +++ b/tests/nemo_text_processing/hy/data_text_normalization/test_cases_word.txt @@ -0,0 +1,50 @@ +~ +yahoo!~yahoo! +20 !~քսան ! 
+սսսս 55~սսսս հիսունհինգ +x~x +—~— +aaa~aaa +aabach~aabach +aabenraa~aabenraa +aabye~aabye +aaccessed~aaccessed +aach~aach +aachen's~aachen's +aadri~aadri +aafia~aafia +aagaard~aagaard +aagadu~aagadu +aagard~aagard +aagathadi~aagathadi +aaghart's~aaghart's +aagnes~aagnes +aagomoni~aagomoni +aagon~aagon +aagoo~aagoo +aagot~aagot +aahar~aahar +aahh~aahh +aahperd~aahperd +aaibinterstate~aaibinterstate +aajab~aajab +aakasa~aakasa +aakervik~aakervik +aakirkeby~aakirkeby +aalam~aalam +aalbaek~aalbaek +aaldiu~aaldiu +aalem~aalem +a'ali~a'ali +aalilaassamthey~aalilaassamthey +aalin~aalin +aaliyan~aaliyan +aaliyan's~aaliyan's +aamadu~aamadu +aamara~aamara +aambala~aambala +aamera~aamera +aamer's~aamer's +aamina~aamina +aaminah~aaminah +aamjiwnaang~aamjiwnaang diff --git a/tests/nemo_text_processing/hy/test_cardinal.py b/tests/nemo_text_processing/hy/test_cardinal.py index 82a9d1f1d..74bd5170c 100644 --- a/tests/nemo_text_processing/hy/test_cardinal.py +++ b/tests/nemo_text_processing/hy/test_cardinal.py @@ -16,6 +16,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -30,3 +31,12 @@ class TestCardinal: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=True) assert pred == expected + + normalizer = Normalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('hy/data_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred in expected diff --git a/tests/nemo_text_processing/hy/test_decimal.py b/tests/nemo_text_processing/hy/test_decimal.py index 
051b60e21..aaa65a0b7 100644 --- a/tests/nemo_text_processing/hy/test_decimal.py +++ b/tests/nemo_text_processing/hy/test_decimal.py @@ -16,11 +16,13 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestDecimal: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_decimal.txt')) @@ -29,3 +31,12 @@ class TestDecimal: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('hy/data_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_fraction.py b/tests/nemo_text_processing/hy/test_fraction.py index 28157274e..c9fcc7873 100644 --- a/tests/nemo_text_processing/hy/test_fraction.py +++ b/tests/nemo_text_processing/hy/test_fraction.py @@ -16,11 +16,13 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestFraction: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=True) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_fraction.txt')) @@ -29,3 +31,12 @@ class 
TestFraction: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=True) assert pred == expected + + normalizer = Normalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=True, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('hy/data_text_normalization/test_cases_fraction.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_measure.py b/tests/nemo_text_processing/hy/test_measure.py index ccf73cb63..9402523c7 100644 --- a/tests/nemo_text_processing/hy/test_measure.py +++ b/tests/nemo_text_processing/hy/test_measure.py @@ -16,11 +16,13 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestMeasure: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_measure.txt')) @@ -29,3 +31,12 @@ class TestMeasure: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('hy/data_text_normalization/test_cases_measure.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_money.py b/tests/nemo_text_processing/hy/test_money.py index 
8d4189091..291ce764f 100644 --- a/tests/nemo_text_processing/hy/test_money.py +++ b/tests/nemo_text_processing/hy/test_money.py @@ -16,11 +16,13 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestMoney: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_money.txt')) @@ -29,3 +31,12 @@ class TestMoney: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('hy/data_text_normalization/test_cases_money.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_ordinal.py b/tests/nemo_text_processing/hy/test_ordinal.py index 52026d486..1e93f5f2e 100644 --- a/tests/nemo_text_processing/hy/test_ordinal.py +++ b/tests/nemo_text_processing/hy/test_ordinal.py @@ -16,11 +16,13 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestOrdinal: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=True) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_ordinal.txt')) @@ -29,3 +31,12 @@ class TestOrdinal: def 
test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=True) assert pred == expected + + normalizer = Normalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('hy/data_text_normalization/test_cases_ordinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/hy/test_sparrowhawk_normalization.sh new file mode 100755 index 000000000..15d211c0b --- /dev/null +++ b/tests/nemo_text_processing/hy/test_sparrowhawk_normalization.sh @@ -0,0 +1,69 @@ +#! /bin/sh + +PROJECT_DIR=/workspace/tests + +runtest () { + input=$1 + cd /workspace/sparrowhawk/documentation/grammars + + # read test file + while read testcase; do + IFS='~' read written spoken <<< $testcase + denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + + # trim white space + spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$written" "$spoken" "$denorm_pred" + done < "$input" +} + +testTNCardinal() { + input=$PROJECT_DIR/hy/data_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testTNDecimal() { + input=$PROJECT_DIR/hy/data_text_normalization/test_cases_decimal.txt + runtest $input +} + +testTNFraction() { + input=$PROJECT_DIR/hy/data_text_normalization/test_cases_fraction.txt + runtest $input +} + +testTNOrdinal() { + input=$PROJECT_DIR/hy/data_text_normalization/test_cases_ordinal.txt + runtest $input +} + +testTNTime() { + input=$PROJECT_DIR/hy/data_text_normalization/test_cases_time.txt + 
runtest $input +} + +testTNMeasure() { + input=$PROJECT_DIR/hy/data_text_normalization/test_cases_measure.txt + runtest $input +} + +testTNWhitelist() { + input=$PROJECT_DIR/hy/data_text_normalization/test_cases_whitelist.txt + runtest $input +} + +testTNWord() { + input=$PROJECT_DIR/hy/data_text_normalization/test_cases_word.txt + runtest $input +} + +testTNMoney() { + input=$PROJECT_DIR/hy/data_text_normalization/test_cases_money.txt + runtest $input +} + +# Load shUnit2 +. $PROJECT_DIR/../shunit2/shunit2 \ No newline at end of file diff --git a/tests/nemo_text_processing/hy/test_time.py b/tests/nemo_text_processing/hy/test_time.py index 7b9df98b5..6c0f72537 100644 --- a/tests/nemo_text_processing/hy/test_time.py +++ b/tests/nemo_text_processing/hy/test_time.py @@ -16,11 +16,13 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestTime: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_time.txt')) @@ -29,3 +31,12 @@ class TestTime: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('hy/data_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_whitelist.py b/tests/nemo_text_processing/hy/test_whitelist.py index 59b31483f..75562cf9f 100644 --- 
a/tests/nemo_text_processing/hy/test_whitelist.py +++ b/tests/nemo_text_processing/hy/test_whitelist.py @@ -16,11 +16,13 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestWhitelist: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_whitelist.txt')) @@ -29,3 +31,12 @@ class TestWhitelist: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('hy/data_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_word.py b/tests/nemo_text_processing/hy/test_word.py index 69a34456d..30f7274b1 100644 --- a/tests/nemo_text_processing/hy/test_word.py +++ b/tests/nemo_text_processing/hy/test_word.py @@ -16,11 +16,13 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestWord: + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_word.txt')) @@ -29,3 +31,12 @@ class TestWord: def test_denorm(self, test_input, expected): 
pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased') + + @parameterized.expand(parse_test_case_file('hy/data_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index aa3207e3e..22b455236 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -235,6 +235,10 @@ def parse_args(): from nemo_text_processing.inverse_text_normalization.hy.verbalizers.verbalize import ( VerbalizeFst as ITNVerbalizeFst, ) + from nemo_text_processing.text_normalization.hy.taggers.tokenize_and_classify import ( + ClassifyFst as TNClassifyFst, + ) + from nemo_text_processing.text_normalization.hy.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( output_dir=output_dir, From 462d551c4efac3cbeeb86532c0ec7a9f1d38a12b Mon Sep 17 00:00:00 2001 From: Chinmay Patil <72211393+ChinmayPatil11@users.noreply.github.com> Date: Wed, 13 Mar 2024 22:49:29 +0530 Subject: [PATCH 11/90] Marathi ITN (#134) * Added Marathi ITN Signed-off-by: Chinmay Patil * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adding jenkins test Signed-off-by: Travis Bartley --------- Signed-off-by: Chinmay Patil Signed-off-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> Signed-off-by: Travis Bartley Co-authored-by: pre-commit-ci[bot] 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> Co-authored-by: Travis Bartley Signed-off-by: Alex Cui --- Jenkinsfile | 39 ++++ .../inverse_normalize.py | 7 +- .../inverse_text_normalization/mr/__init__.py | 17 ++ .../mr/data/__init__.py | 13 ++ .../mr/data/date/__init__.py | 13 ++ .../mr/data/date/dates.tsv | 31 +++ .../mr/data/date/months.tsv | 12 ++ .../mr/data/date/prefixes.tsv | 2 + .../mr/data/numbers/__init__.py | 13 ++ .../mr/data/numbers/digits.tsv | 9 + .../mr/data/numbers/hundred.tsv | 1 + .../mr/data/numbers/tens.tsv | 90 ++++++++ .../mr/data/numbers/thousands.tsv | 4 + .../mr/data/numbers/zero.tsv | 1 + .../mr/data/time/__init__.py | 13 ++ .../mr/data/time/hours.tsv | 12 ++ .../mr/data/time/hours_to.tsv | 12 ++ .../mr/data/time/minutes.tsv | 60 ++++++ .../mr/data/time/minutes_to.tsv | 59 ++++++ .../mr/graph_utils.py | 194 ++++++++++++++++++ .../mr/taggers/__init__.py | 13 ++ .../mr/taggers/cardinal.py | 109 ++++++++++ .../mr/taggers/date.py | 61 ++++++ .../mr/taggers/decimal.py | 98 +++++++++ .../mr/taggers/punctuation.py | 36 ++++ .../mr/taggers/time.py | 117 +++++++++++ .../mr/taggers/tokenize_and_classify.py | 97 +++++++++ .../mr/taggers/word.py | 31 +++ .../inverse_text_normalization/mr/utils.py | 60 ++++++ .../mr/verbalizers/__init__.py | 13 ++ .../mr/verbalizers/cardinal.py | 53 +++++ .../mr/verbalizers/date.py | 92 +++++++++ .../mr/verbalizers/decimal.py | 62 ++++++ .../mr/verbalizers/time.py | 55 +++++ .../mr/verbalizers/verbalize.py | 37 ++++ .../mr/verbalizers/verbalize_final.py | 45 ++++ .../mr/verbalizers/word.py | 39 ++++ tests/nemo_text_processing/mr/__init__.py | 13 ++ .../test_cases_cardinal.txt | 33 +++ .../test_cases_date.txt | 29 +++ .../test_cases_decimal.txt | 29 +++ .../test_cases_time.txt | 29 +++ .../test_cases_word.txt | 11 + .../nemo_text_processing/mr/test_cardinal.py | 31 +++ tests/nemo_text_processing/mr/test_date.py | 31 +++ 
tests/nemo_text_processing/mr/test_decimal.py | 31 +++ ..._sparrowhawk_inverse_text_normalization.sh | 49 +++++ tests/nemo_text_processing/mr/test_time.py | 31 +++ tests/nemo_text_processing/mr/test_word.py | 32 +++ .../export_grammars.sh | 2 +- .../pynini_export.py | 11 +- 51 files changed, 1978 insertions(+), 4 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/mr/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/date/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/date/dates.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/date/months.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/date/prefixes.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/numbers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/numbers/digits.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/numbers/hundred.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/numbers/tens.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/numbers/thousands.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/numbers/zero.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/time/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/time/hours.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/time/hours_to.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/time/minutes.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/mr/data/time/minutes_to.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/mr/graph_utils.py create mode 100644 
nemo_text_processing/inverse_text_normalization/mr/taggers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/taggers/date.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/taggers/punctuation.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/taggers/time.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/taggers/tokenize_and_classify.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/taggers/word.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/verbalizers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/verbalizers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/verbalizers/date.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/verbalizers/decimal.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/verbalizers/time.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/verbalizers/verbalize.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/verbalizers/verbalize_final.py create mode 100644 nemo_text_processing/inverse_text_normalization/mr/verbalizers/word.py create mode 100644 tests/nemo_text_processing/mr/__init__.py create mode 100644 tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_date.txt create mode 100644 tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_decimal.txt create mode 100644 
tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_time.txt create mode 100644 tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_word.txt create mode 100644 tests/nemo_text_processing/mr/test_cardinal.py create mode 100644 tests/nemo_text_processing/mr/test_date.py create mode 100644 tests/nemo_text_processing/mr/test_decimal.py create mode 100644 tests/nemo_text_processing/mr/test_sparrowhawk_inverse_text_normalization.sh create mode 100644 tests/nemo_text_processing/mr/test_time.py create mode 100644 tests/nemo_text_processing/mr/test_word.py diff --git a/Jenkinsfile b/Jenkinsfile index 0bc046399..d671a53c0 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -24,6 +24,8 @@ pipeline { SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-27-23-0' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-26-23-0' + HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' + MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { @@ -239,6 +241,33 @@ pipeline { } } + stage('L0: Create HY TN/ITN Grammars & MR') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: MR ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=mr --text="शून्य" --cache_dir ${MR_ITN_CACHE}' + } + } + stage('L0: HY TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hy --text="6" --cache_dir ${HY_TN_CACHE}' + } + } + stage('L0: HY ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hy --text="վեց" --cache_dir ${HY_TN_CACHE}' + } + } + } + } + // L1 
Tests starts here stage('L1: TN/ITN Tests CPU') { @@ -305,6 +334,16 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/zh/ -m "not pleasefixme" --cpu --tn_cache_dir ${ZH_TN_CACHE}' } } + stage('L1: Run all MR ITN tests (restore grammars from cache)') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/mr/ -m "not pleasefixme" --cpu --tn_cache_dir ${MR_TN_CACHE}' + } + } + stage('L1: Run all HY TN/ITN tests (restore grammars from cache)') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hy/ -m "not pleasefixme" --cpu --tn_cache_dir ${HY_TN_CACHE}' + } + } } } diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index b13fe2c65..b2f842822 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -111,6 +111,11 @@ def __init__( from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) + elif lang == 'mr': # Marathi + from nemo_text_processing.inverse_text_normalization.mr.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.inverse_text_normalization.mr.verbalizers.verbalize_final import ( + VerbalizeFinalFst, + ) elif lang == 'hy': from nemo_text_processing.inverse_text_normalization.hy.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.hy.verbalizers.verbalize_final import ( @@ -160,7 +165,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hy'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hy', 'mr'], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/mr/__init__.py 
b/nemo_text_processing/inverse_text_normalization/mr/__init__.py new file mode 100644 index 000000000..9f70e3146 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.mr.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.mr.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.mr.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/__init__.py b/nemo_text_processing/inverse_text_normalization/mr/data/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/date/__init__.py b/nemo_text_processing/inverse_text_normalization/mr/data/date/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/date/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/date/dates.tsv b/nemo_text_processing/inverse_text_normalization/mr/data/date/dates.tsv new file mode 100644 index 000000000..b77c7dec4 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/date/dates.tsv @@ -0,0 +1,31 @@ +१ एक +२ दोन +३ तीन +४ चार +५ पाच +६ सहा +७ सात +८ आठ +९ नऊ +१० दहा +११ अकरा +१२ बारा +१३ तेरा +१४ चौदा +१५ पंधरा +१६ सोळा +१७ सतरा +१८ अठरा +१९ एकोणीस +२० वीस +२१ एकवीस +२२ बावीस +२३ तेवीस +२४ चोवीस +२५ पंचवीस +२६ सव्वीस +२७ सत्तावीस +२८ अठ्ठावीस +२९ एकोणतीस +३० तीस +३१ एकतीस \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/date/months.tsv b/nemo_text_processing/inverse_text_normalization/mr/data/date/months.tsv new file mode 100644 index 000000000..f39b90d2c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/date/months.tsv @@ -0,0 +1,12 @@ +जानेवारी +फेब्रुवारी +मार्च +एप्रिल +मे +जून +जुलै +ऑगस्ट +सप्टेंबर +ऑक्टोबर +नोव्हेंबर +डिसेंबर \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/date/prefixes.tsv b/nemo_text_processing/inverse_text_normalization/mr/data/date/prefixes.tsv new file mode 100644 index 000000000..dce4c03b5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/date/prefixes.tsv @@ -0,0 +1,2 @@ +इसवी सन पूर्व इ.स.पू. +इसवी सन इ.स. \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/mr/data/numbers/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/numbers/digits.tsv b/nemo_text_processing/inverse_text_normalization/mr/data/numbers/digits.tsv new file mode 100644 index 000000000..f737ade2c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/numbers/digits.tsv @@ -0,0 +1,9 @@ +१ एक +२ दोन +३ तीन +४ चार +५ पाच +६ सहा +७ सात +८ आठ +९ नऊ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/numbers/hundred.tsv b/nemo_text_processing/inverse_text_normalization/mr/data/numbers/hundred.tsv new file mode 100644 index 000000000..5b05d55e9 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/numbers/hundred.tsv @@ -0,0 +1 @@ +१०० शंभर \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/numbers/tens.tsv b/nemo_text_processing/inverse_text_normalization/mr/data/numbers/tens.tsv new file mode 100644 index 000000000..81107a31a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/numbers/tens.tsv @@ -0,0 +1,90 @@ +१० दहा +११ अकरा +१२ बारा +१३ तेरा +१४ चौदा +१५ पंधरा +१६ सोळा +१७ सतरा +१८ अठरा +१९ एकोणीस +२० वीस +२१ एकवीस +२२ बावीस +२३ तेवीस +२४ चोवीस +२५ पंचवीस +२६ सव्वीस +२७ सत्तावीस +२८ अठ्ठावीस +२९ एकोणतीस +३० तीस +३१ एकतीस +३२ बत्तीस +३३ तेहतीस +३४ चौतीस +३५ पस्तीस +३६ छत्तीस +३७ सदतीस +३८ अडतीस +३९ एकोणचाळीस +४० चाळीस +४१ एकेचाळीस +४२ बेचाळीस +४३ त्रेचाळीस +४४ चव्वेचाळीस +४५ पंचेचाळीस +४६ सेहचाळीस +४७ सत्तेचाळीस +४८ अठ्ठेचाळीस +४९ एकोणपन्नास +५० पन्नास +५१ एकावन्न +५२ बावन्न +५३ त्रेपन्न 
+५४ चौपन्न +५५ पंचावन्न +५६ छप्पन +५७ सत्तावन्न +५८ अठ्ठावन्न +५९ एकोणसाठ +६० साठ +६१ एकसष्ट +६२ बासष्ट +६३ त्रेसष्ट +६४ चौसष्ट +६५ पासष्ट +६६ सहासष्ठ +६७ सदुसष्ट +६८ अडुसष्ठ +६९ एकोणसत्तर +७० सत्तर +७१ एकाहत्तर +७२ बाहत्तर +७३ त्र्याहत्तर +७४ चौऱ्याहत्तर +७५ पंचाहत्तर +७६ शहात्तर +७७ सत्याहत्तर +७८ अठ्ठ्यात्तर +७९ एकोणऐंशी +८० ऐंशी +८१ एक्याऐंशी +८२ ब्याऐंशी +८३ त्र्याऐंशी +८४ चौऱ्याऐंशी +८५ पंच्याऐंशी +८६ सह्यांशी +८७ सत्याऐंशी +८८ अठ्ठ्याऐंशी +८९ एकोणनव्वद +९० नव्वद +९१ एक्याण्णव +९२ ब्याण्णव +९३ त्र्याण्णव +९४ चौऱ्याण्णव +९५ पंच्याण्णव +९६ शह्याण्णवx +९७ सत्त्याण्णव +९८ अठ्ठ्याण्णव +९९ नव्व्याण्णव \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/numbers/thousands.tsv b/nemo_text_processing/inverse_text_normalization/mr/data/numbers/thousands.tsv new file mode 100644 index 000000000..f9f09996c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/numbers/thousands.tsv @@ -0,0 +1,4 @@ +हजार +लाख +कोटी +अब्ज \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/mr/data/numbers/zero.tsv new file mode 100644 index 000000000..727ccdf16 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/numbers/zero.tsv @@ -0,0 +1 @@ +० शून्य \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/time/__init__.py b/nemo_text_processing/inverse_text_normalization/mr/data/time/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/time/hours.tsv b/nemo_text_processing/inverse_text_normalization/mr/data/time/hours.tsv new file mode 100644 index 000000000..b89645ec9 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/time/hours.tsv @@ -0,0 +1,12 @@ +एक १ +दोन २ +तीन ३ +चार ४ +पाच ५ +सहा ६ +सात ७ +आठ ८ +नऊ ९ +दहा १० +अकरा ११ +बारा १२ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/time/hours_to.tsv b/nemo_text_processing/inverse_text_normalization/mr/data/time/hours_to.tsv new file mode 100644 index 000000000..34d69d331 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/time/hours_to.tsv @@ -0,0 +1,12 @@ +एक १२ +दोन १ +तीन २ +चार ३ +पाच ४ +सहा ५ +सात ६ +आठ ७ +नऊ ८ +दहा ९ +अकरा १० +बारा ११ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/time/minutes.tsv b/nemo_text_processing/inverse_text_normalization/mr/data/time/minutes.tsv new file mode 100644 index 000000000..e9ae86c0b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/time/minutes.tsv @@ -0,0 +1,60 @@ +एक १ +दोन २ +तीन ३ +चार ४ +पाच ५ +सहा ६ +सात ७ +आठ ८ +नऊ ९ +दहा १० +अकरा ११ +बारा १२ +तेरा १३ +चौदा १४ +पंधरा १५ +सोळा १६ +सतरा १७ +अठरा १८ +एकोणीस १९ +वीस २० +एकवीस २१ +बावीस २२ +तेवीस २३ +चोवीस २४ +पंचवीस २५ +सव्वीस २६ +सत्तावीस २७ +अठ्ठावीस २८ +एकोणतीस २९ +तीस ३० +एकतीस ३१ +बत्तीस ३२ +तेहतीस ३३ +चौतीस ३४ +पस्तीस ३५ +छत्तीस ३६ +सदतीस ३७ +अडतीस ३८ +एकोणचाळीस ३९ +चाळीस ४० +एकेचाळीस ४१ +बेचाळीस ४२ 
+त्रेचाळीस ४३ +चव्वेचाळीस ४४ +पंचेचाळीस ४५ +सेहेचाळीस ४६ +सत्तेचाळीस ४७ +अठ्ठेचाळीस ४८ +एकोणपन्नास ४९ +पन्नास ५० +एकावन्न ५१ +बावन्न ५२ +त्रेपन्न ५३ +चौपन्न ५४ +पंचावन्न ५५ +छप्पन ५६ +सत्तावन्न ५७ +अठ्ठावन्न ५८ +एकोणसाठ ५९ +साठ ६० \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/mr/data/time/minutes_to.tsv b/nemo_text_processing/inverse_text_normalization/mr/data/time/minutes_to.tsv new file mode 100644 index 000000000..01c63cc76 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/data/time/minutes_to.tsv @@ -0,0 +1,59 @@ +एक ५९ +दोन ५८ +तीन ५७ +चार ५६ +पाच ५५ +सहा ५४ +सात ५३ +आठ ५२ +नऊ ५१ +दहा ५० +अकरा ४९ +बारा ४८ +तेरा ४७ +चौदा ४६ +पंधरा ४५ +सोळा ४४ +सतरा ४३ +अठरा ४२ +एकोणीस ४१ +वीस ४० +एकवीस ३९ +बावीस ३८ +तेवीस ३७ +चोवीस ३६ +पंचवीस ३५ +सव्वीस ३४ +सत्तावीस ३३ +अठ्ठावीस ३२ +एकोणतीस ३२ +तीस ३० +एकतीस २९ +बत्तीस २८ +तेहतीस २७ +चौतीस २६ +पस्तीस २५ +छत्तीस २४ +सदतीस २३ +अडतीस २२ +एकोणचाळीस २१ +चाळीस २० +एकेचाळीस १९ +बेचाळीस १८ +त्रेचाळीस १७ +चव्वेचाळीस १६ +पंचेचाळीस १५ +सेहचाळीस १४ +सत्तेचाळीस १३ +अठ्ठेचाळीस १२ +एकोणपन्नास ११ +पन्नास १० +एकावन्न ९ +बावन्न ८ +त्रेपन्न ७ +चौपन्न ६ +पंचावन्न ५ +छप्पन ४ +सत्तावन्न ३ +अठ्ठावन्न २ +एकोणसाठ १ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/mr/graph_utils.py b/nemo_text_processing/inverse_text_normalization/mr/graph_utils.py new file mode 100644 index 000000000..9e6276813 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/graph_utils.py @@ -0,0 +1,194 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini import Far +from pynini.export import export +from pynini.lib import pynutil, utf8 + +from nemo_text_processing.inverse_text_normalization.mr.utils import load_labels + +NEMO_CHAR = utf8.VALID_UTF8_CHAR +NEMO_MARATHI_DIGITS = ( + "\u0966" + "\u0967" + "\u0968" + "\u0969" + "\u096A" + "\u096B" + "\u096C" + "\u096D" + "\u096E" + "\u096F" +) +NEMO_DIGIT = pynini.union(*NEMO_MARATHI_DIGITS).optimize() +NEMO_HEX = pynini.union(*string.hexdigits).optimize() +NEMO_NON_BREAKING_SPACE = u"\u00A0" +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize() +NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() + +NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() +NEMO_GRAPH = pynini.union(NEMO_CHAR, NEMO_PUNCT).optimize() + +NEMO_SIGMA = pynini.closure(NEMO_CHAR) +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) +insert_space = pynutil.insert(" ") +delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") + | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"")) +) + +MIN_NEG_WEIGHT = -0.0001 +MIN_POS_WEIGHT = 0.0001 +INPUT_CASED = "cased" +INPUT_LOWER_CASED = "lower_cased" +MINUS 
= pynini.union("उणे").optimize() + + +def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): + """ + Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. + + Args: + file_name: exported file name + graphs: Mapping of a rule name and Pynini WFST graph to be exported + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + logging.info(f'Created {file_name}') + + +def convert_space(fst) -> 'pynini.FstLike': + """ + Converts space to nonbreaking space. + Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" + This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it. + + Args: + fst: input fst + + Returns output fst where breaking spaces are converted to non breaking spaces + """ + return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) + + +def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): + labels = load_labels(input_file) + + if input_case == INPUT_CASED: + additional_labels = [] + for written, spoken, *weight in labels: + written_capitalized = written[0].upper() + written[1:] + additional_labels.extend( + [ + [written_capitalized, spoken.capitalize()], # first letter capitalized + [ + written_capitalized, + spoken.upper().replace(" AND ", " and "), + ], # # add pairs with the all letters capitalized + ] + ) + + spoken_no_space = spoken.replace(" ", "") + # add abbreviations without spaces (both lower and upper case), i.e. 
"BMW" not "B M W" + if len(spoken) == (2 * len(spoken_no_space) - 1): + logging.debug(f"This is weight {weight}") + if len(weight) == 0: + additional_labels.extend( + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]] + ) + else: + additional_labels.extend( + [ + [written, spoken_no_space, weight[0]], + [written_capitalized, spoken_no_space.upper(), weight[0]], + ] + ) + labels += additional_labels + + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + +class GraphFst: + """ + Base class for all grammar fsts. + + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far') + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> 'pynini.FstLike': + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> 'pynini.FstLike': + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> 'pynini.FstLike': + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ 
pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py new file mode 100644 index 000000000..27d0a35c5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py @@ -0,0 +1,109 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.mr.graph_utils import ( + MINUS, + NEMO_DIGIT, + NEMO_SPACE, + GraphFst, + delete_space, +) +from nemo_text_processing.inverse_text_normalization.mr.utils import get_abs_path + + +class CardinalFst(GraphFst): + """ + Finite state transducer for classifying cardinals + e.g. तेहतीस -> cardinal { integer: "३३" } + """ + + def __init__(self): + super().__init__(name="cardinal", kind="classify") + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() + graph_digits = pynini.string_file(get_abs_path("data/numbers/digits.tsv")).invert() + graph_tens = pynini.string_file(get_abs_path("data/numbers/tens.tsv")).invert() + graph_hundred_unique = pynini.string_file(get_abs_path("data/numbers/hundred.tsv")).invert() + + graph_hundred = pynini.cross("शे", "") + + graph_hundred_component = pynini.union(graph_digits + graph_hundred, pynutil.insert("०")) + graph_hundred_component += delete_space + graph_hundred_component += pynini.union(pynutil.insert("००"), graph_tens, pynutil.insert("०") + graph_digits) + + graph_hundred_component_at_least_one_non_zero_digit = graph_hundred_component @ ( + pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "०") + pynini.closure(NEMO_DIGIT) + ) + self.graph_hundred_component_at_least_one_non_zero_digit = graph_hundred_component_at_least_one_non_zero_digit + + # eleven hundred -> 1100 etc form + graph_hundred_as_thousand = graph_tens + graph_hundred + graph_hundred_as_thousand += delete_space + pynini.union( + pynutil.insert("००"), graph_tens, pynutil.insert("०") + graph_digits + ) + + graph_hundreds = graph_hundred_component | graph_hundred_as_thousand + + graph_two_digit_component = pynini.union(pynutil.insert("००"), graph_tens, pynutil.insert("०") + graph_digits) + + graph_two_digit_component_at_least_one_non_zero_digit = graph_two_digit_component @ ( + pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "०") + 
pynini.closure(NEMO_DIGIT) + ) + self.graph_two_digit_component_at_least_one_non_zero_digit = ( + graph_two_digit_component_at_least_one_non_zero_digit + ) + + graph_thousands = pynini.union( + graph_two_digit_component_at_least_one_non_zero_digit + delete_space + pynutil.delete("हजार"), + pynutil.insert("००", weight=0.1), + ) + + graph_lakhs = pynini.union( + graph_two_digit_component_at_least_one_non_zero_digit + delete_space + pynutil.delete("लाख"), + pynutil.insert("००", weight=0.1), + ) + + graph_crores = pynini.union( + graph_two_digit_component_at_least_one_non_zero_digit + delete_space + pynutil.delete("कोटी"), + pynutil.insert("००", weight=0.1), + ) + + graph_arabs = pynini.union( + graph_two_digit_component_at_least_one_non_zero_digit + delete_space + pynutil.delete("अब्ज"), + pynutil.insert("००", weight=0.1), + ) + + graph_higher_powers = ( + graph_arabs + delete_space + graph_crores + delete_space + graph_lakhs + delete_space + graph_thousands + ) + + graph = pynini.union(graph_higher_powers + delete_space + graph_hundreds, graph_hundred_unique, graph_zero,) + + graph = graph @ pynini.union( + pynutil.delete(pynini.closure("०")) + pynini.difference(NEMO_DIGIT, "०") + pynini.closure(NEMO_DIGIT), "०" + ) + graph = graph.optimize() + + self.graph = (pynini.project(graph, "input")) @ graph + + optional_minus_graph = pynini.closure( + pynutil.insert("negative: ") + pynini.cross(MINUS, "\"-\"") + NEMO_SPACE, 0, 1 + ) + + final_graph = optional_minus_graph + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py new file mode 100644 index 000000000..96e8fb08d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py @@ -0,0 +1,61 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.mr.graph_utils import GraphFst, delete_extra_space, delete_space +from nemo_text_processing.inverse_text_normalization.mr.utils import get_abs_path + + +class DateFst(GraphFst): + """ + Finite State Transducer for classifying dates + e.g. दहा जानेवारी दोन हजार -> date { day: "१०" month: "जानेवारी" year: "२०००" preserve_order: true } + e.g. इसवी सन दोन हजार बावीस -> date { text: "इ.स." 
year: "२०२२" preserve_order: true } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name='date', kind="classify") + months = pynini.string_file(get_abs_path("data/date/months.tsv")) + dates = pynini.string_file(get_abs_path("data/date/dates.tsv")).invert() + prefixes = pynini.string_file(get_abs_path("data/date/prefixes.tsv")) + + YEAR_WEIGHT = 0.001 + month_graph = pynutil.insert("month: \"") + months + pynutil.insert("\" ") + day_graph = pynutil.insert("day: \"") + dates + pynutil.insert("\" ") + year_graph = cardinal.graph + graph_year = ( + delete_extra_space + + pynutil.insert("year: \"") + + pynutil.add_weight(year_graph, -YEAR_WEIGHT) + + pynutil.insert("\"") + ) + optional_graph_year = pynini.closure(graph_year, 0, 1,) + graph_ad_bc = pynutil.insert("text: \"") + prefixes + delete_space + pynutil.insert("\"") + + graph_mdy = month_graph + ( + (delete_extra_space + day_graph) | graph_year | (delete_extra_space + day_graph + graph_year) + ) + graph_dmy = day_graph + delete_space + month_graph + optional_graph_year + graph_year_prefix = graph_ad_bc + graph_year + + final_graph = graph_mdy | graph_dmy | graph_year_prefix + final_graph += pynutil.insert(" preserve_order: true") + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py new file mode 100644 index 000000000..9434f77fe --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py @@ -0,0 +1,98 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.mr.graph_utils import ( + MINUS, + NEMO_DIGIT, + NEMO_SPACE, + GraphFst, + delete_extra_space, + delete_space, +) +from nemo_text_processing.inverse_text_normalization.mr.utils import get_abs_path, load_labels + + +def get_quantity(decimal, cardinal_fst): + numbers = cardinal_fst @ ( + pynutil.delete(pynini.closure("०")) + pynini.difference(NEMO_DIGIT, "०") + pynini.closure(NEMO_DIGIT) + ) + suffix_labels = load_labels(get_abs_path("/data/numbers/thousands.tsv")) + suffix_labels = [x[0] for x in suffix_labels if x[0] != "हजार"] + suffix = pynini.union(*suffix_labels).optimize() + + res = ( + pynutil.insert("integer_part: \"") + + numbers + + pynutil.insert("\"") + + delete_extra_space + + pynutil.insert("quantity: \"") + + suffix + + pynutil.insert("\"") + ) + res |= decimal + delete_extra_space + pynutil.insert("quantity: \"") + (suffix | "हजार") + pynutil.insert("\"") + + return res + + +class DecimalFst(GraphFst): + """ + Finite state transducer for classifying cardinals + e.g. तेहतीस पूर्णांक तीन -> decimal { integer_part: "३३" fractional_part: "३" } + e.g. 
उणे तेहतीस पूर्णांक तीन लाख -> decimal { negative: "true" integer_part: "३३" fractional_part: "३" quantity: "लाख" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="decimal", kind="classify") + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() + graph_digits = pynini.string_file(get_abs_path("data/numbers/digits.tsv")).invert() + decimal_word = pynini.cross("पूर्णांक", "") + optional_graph_negative = pynini.closure( + pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1, + ) + graph_integer = ( + pynutil.insert("integer_part: \"") + + pynini.closure(cardinal.graph, 0, 1) + + pynutil.insert("\"") + + NEMO_SPACE + ) + graph_decimal = graph_integer + delete_space + decimal_word + + graph_fractional = ( + pynutil.insert("fractional_part: \"") + + pynini.closure(delete_space + (graph_zero | graph_digits), 1) + + pynutil.insert("\"") + ) + graph_decimal += graph_fractional + + final_graph_without_sign = graph_decimal + final_graph = optional_graph_negative + final_graph_without_sign + + self.final_graph_without_negative = final_graph_without_sign | get_quantity( + final_graph_without_sign, cardinal.graph_hundred_component_at_least_one_non_zero_digit + ) + + quantity_graph = get_quantity( + final_graph_without_sign, cardinal.graph_hundred_component_at_least_one_non_zero_digit + ) + final_graph |= optional_graph_negative + quantity_graph + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/punctuation.py new file mode 100644 index 000000000..97b5257a0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/punctuation.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.mr.graph_utils import GraphFst + + +class PunctuationFst(GraphFst): + """ + Finite state transducer for classifying punctuation + e.g. a, -> tokens { name: "a" } tokens { name: "," } + """ + + def __init__(self): + super().__init__(name="punctuation", kind="classify") + + s = "!#$%&\'()*+,-./:;<=>?@^_`{|}~" + punct = pynini.union(*s) + + graph = pynutil.insert("name: \"") + punct + pynutil.insert("\"") + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/time.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/time.py new file mode 100644 index 000000000..c4b311e4b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/time.py @@ -0,0 +1,117 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.mr.graph_utils import GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.mr.utils import get_abs_path + + +class TimeFst(GraphFst): + """ + Finite state transducer for classifying time + e.g. साडे चार -> time { hours: "४" minutes: "३०" } + e.g. सव्वा बारा -> time { hours: "१२" minutes: "१५" } + e.g. पावणे दहा -> time { hours: "९" minutes: "४५" } + e.g. अकराला पाच मिनिटे -> time { hours: "१०" minutes: "५५" } + e.g. अकरा वाजून दोन मिनिटे -> time { hours: "११" minutes: "२" } + e.g. अडीच -> time { hours: "२" minutes: "३०" } + """ + + def __init__(self): + super().__init__(name="time", kind="classify") + hours = pynini.string_file(get_abs_path("data/time/hours.tsv")) + minutes = pynini.string_file(get_abs_path("data/time/minutes.tsv")) + hours_to = pynini.string_file(get_abs_path("data/time/hours_to.tsv")) + minutes_to = pynini.string_file(get_abs_path("data/time/minutes_to.tsv")) + + time_word = pynini.cross("वाजून", "") + minutes_word = pynini.cross("मिनिटे", "") | pynini.cross("मिनिट", "") + graph_time_full = ( + pynutil.insert("hours: \"") + + hours + + pynutil.insert("\"") + + delete_space + + time_word + + delete_space + + pynutil.insert(" ") + + pynutil.insert("minutes: \"") + + minutes + + pynutil.insert("\"") + + delete_space + + minutes_word + ) + graph_time_to = ( + pynutil.insert("hours: \"") + + hours_to + + pynutil.insert("\"") + + pynini.cross("ला", "") + + delete_space + + pynutil.insert(" ") + + pynutil.insert("minutes: \"") + + minutes_to + + pynutil.insert("\"") + + delete_space + + minutes_word + ) + + # special terms used for 15, 30 and 45 minutes + graph_fifteen = ( + pynini.cross("सव्वा", "") + + delete_space + + pynutil.insert("hours: \"") + + hours + + pynutil.insert("\"") + + pynutil.insert(" ") + + 
pynutil.insert("minutes: \"") + + pynutil.insert("१५") + + pynutil.insert("\"") + ) + graph_thirty = ( + pynini.cross("साडे", "") + + delete_space + + pynutil.insert("hours: \"") + + hours + + pynutil.insert("\"") + + pynutil.insert(" ") + + pynutil.insert("minutes: \"") + + pynutil.insert("३०") + + pynutil.insert("\"") + ) + graph_fortyfive = ( + pynini.cross("पावणे", "") + + delete_space + + pynutil.insert("hours: \"") + + hours_to + + pynutil.insert("\"") + + pynutil.insert(" ") + + pynutil.insert("minutes: \"") + + pynutil.insert("४५") + + pynutil.insert("\"") + ) + + special_cases = (pynini.cross("दीड", "") + pynutil.insert("hours: \"१\" minutes: \"३०\"")) | ( + pynini.cross("अडीच", "") + pynutil.insert("hours: \"२\" minutes: \"३०\"") + ) + + graph = pynini.union( + graph_time_full, graph_time_to, graph_fifteen, graph_thirty, graph_fortyfive, special_cases + ) + + final_graph = graph + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..0409b0a25 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/tokenize_and_classify.py @@ -0,0 +1,97 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.mr.graph_utils import ( + INPUT_LOWER_CASED, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) +from nemo_text_processing.inverse_text_normalization.mr.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.mr.taggers.date import DateFst +from nemo_text_processing.inverse_text_normalization.mr.taggers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.mr.taggers.punctuation import PunctuationFst +from nemo_text_processing.inverse_text_normalization.mr.taggers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.mr.taggers.word import WordFst + + +class ClassifyFst(GraphFst): + """ + Final class that composes all other classification grammars. This class can process an entire sentence. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment + + Args: + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. + overwrite_cache: set to True to overwrite .far files + whitelist: path to a file with whitelist replacements + input_case: accepting either "lower_cased" or "cased" input. 
+ """ + + def __init__( + self, + cache_dir: str = None, + overwrite_cache: bool = False, + whitelist: str = None, + input_case: str = INPUT_LOWER_CASED, + ): + super().__init__(name="tokenize_and_classify", kind="classify") + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"mr_itn_{input_case}.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] + logging.info(f"ClassifyFst.fst was restored from {far_file}.") + else: + logging.info(f"Creating ClassifyFst grammars.") + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + decimal_graph = DecimalFst(cardinal).fst + time_graph = TimeFst().fst + date_graph = DateFst(cardinal).fst + + word_graph = WordFst().fst + punct_graph = PunctuationFst().fst + classify = ( + pynutil.add_weight(cardinal_graph, 1.1) + | pynutil.add_weight(decimal_graph, 1.1) + | pynutil.add_weight(time_graph, 1.1) + | pynutil.add_weight(date_graph, 1.09) + | pynutil.add_weight(word_graph, 100) + ) + + punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") + token_plus_punct = ( + pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) + ) + + graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct) + graph = delete_space + graph + delete_space + + self.fst = graph.optimize() + + if far_file: + generator_main(far_file, {"tokenize_and_classify": self.fst}) + logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/word.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/word.py new file mode 100644 index 000000000..2a062e5a3 --- /dev/null +++ 
b/nemo_text_processing/inverse_text_normalization/mr/taggers/word.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.mr.graph_utils import NEMO_NOT_SPACE, GraphFst + + +class WordFst(GraphFst): + """ + Finite state transducer for classifying plain tokens, that do not belong to any special class. This can be considered as the default class. + e.g. चालणे -> tokens { name: "चालणे" } + """ + + def __init__(self): + super().__init__(name="word", kind="classify") + word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/mr/utils.py b/nemo_text_processing/inverse_text_normalization/mr/utils.py new file mode 100644 index 000000000..f7179e35b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/utils.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import os + + +def get_abs_path(rel_path): + """ + Get absolute path + + Args: + rel_path: relative path to this file + + Returns absolute path + """ + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + + +def load_labels(abs_path): + """ + loads relative path file as dictionary + + Args: + abs_path: absolute path + + Returns dictionary of mappings + """ + with open(abs_path, encoding="utf-8") as label_tsv: + labels = list(csv.reader(label_tsv, delimiter="\t")) + return labels + + +def augment_labels_with_punct_at_end(labels): + """ + augments labels: if key ends on a punctuation that value does not have, add a new label + where the value maintains the punctuation + + Args: + labels : input labels + Returns: + additional labels + """ + res = [] + for label in labels: + if len(label) > 1: + if label[0][-1] == "." and label[1][-1] != ".": + res.append([label[0], label[1] + "."] + label[2:]) + return res diff --git a/nemo_text_processing/inverse_text_normalization/mr/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/mr/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/cardinal.py new file mode 100644 index 000000000..5ca2f361c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/cardinal.py @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.mr.graph_utils import NEMO_DIGIT, GraphFst, delete_space + + +class CardinalFst(GraphFst): + """ + Finite state transducer for verbalizing cardinal + e.g. 
cardinal { negative: "-" integer: "३३६२००" } : -३३६२०० + """ + + def __init__(self): + super().__init__(name="cardinal", kind="verbalize") + + optional_sign = pynini.closure( + pynutil.delete("negative:") + + delete_space + + pynutil.delete("\"") + + pynini.accep("-") + + pynutil.delete("\"") + + delete_space, + 0, + 1, + ) + graph = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_DIGIT, 1) # Accepts at least one digit change nemo digit to whatever is relevant + + pynutil.delete("\"") + + delete_space + ) + # graph = optional_sign + graph # concatenates two properties + graph = optional_sign + graph + delete_tokens = self.delete_tokens(graph) # removes semiotic class tag + + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/mr/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/date.py new file mode 100644 index 000000000..617d55449 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/date.py @@ -0,0 +1,92 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.mr.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_extra_space, + delete_space, +) + + +class DateFst(GraphFst): + """ + Finite state transducer for verbalizing date, e.g. + e.g. date { month: "जानेवारी" year: "२०००" preserve_order: true } -> जानेवारी २००० + e.g. date { day: "१५" month: "फेब्रुवारी" year: "२०२३" } -> १५ फेब्रुवारी २०२३ + e.g. date { text: "इ.स.पू." year: "१९८५" preserve_order: true } -> इ.स.पू. १९८५ + """ + + def __init__(self): + super().__init__(name="date", kind="verbalize") + month = ( + pynutil.delete("month:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + day = ( + pynutil.delete("day:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + year = ( + pynutil.delete("year:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + delete_space + + pynutil.delete("\"") + ) + period = ( + pynutil.delete("text:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + graph_fy = period + pynini.closure(delete_extra_space + year, 0, 1) + # month (day) year + graph_mdy = ( + month + pynini.closure(delete_extra_space + day, 0, 1) + pynini.closure(delete_extra_space + year, 0, 1) + ) + + # (day) month year + graph_dmy = ( + pynini.closure(day + delete_extra_space, 0, 1) + month + pynini.closure(delete_extra_space + year, 0, 1) + ) + + optional_preserve_order = pynini.closure( + pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space + | pynutil.delete("field_order:") + + delete_space + + pynutil.delete("\"") + + NEMO_NOT_QUOTE + + pynutil.delete("\"") + + delete_space + ) + + final_graph = (graph_mdy | year | graph_dmy | graph_fy) + delete_space + optional_preserve_order | ( + graph_mdy | year | 
graph_dmy | graph_fy + ) + delete_space + optional_preserve_order + + delete_tokens = self.delete_tokens(final_graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/mr/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/decimal.py new file mode 100644 index 000000000..1976fae24 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/decimal.py @@ -0,0 +1,62 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.mr.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + +class DecimalFst(GraphFst): + """ + Finite state transducer for verbalizing decimal + e.g. decimal { integer_part: "३००" fractional_part: "०३१" } -> ३००.०३१ + e.g. decimal { negative: "true" integer_part: "७३" fractional_part: "५" quantity: "लाख" } -> -७३.५ लाख + e.g. 
decimal { integer_part: "००८" fractional_part: "५०" } -> ८.५० + """ + + def __init__(self): + super().__init__(name="decimal", kind="verbalize") + optional_sign = pynini.closure(pynini.cross("negative: \"true\"", "-") + delete_space, 0, 1) + integer = ( + pynutil.delete("integer_part:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + optional_integer = pynini.closure(integer + delete_space, 0, 1) + fractional = ( + pynutil.insert(".") + + pynutil.delete("fractional_part:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + optional_fractional = pynini.closure(fractional + delete_space, 0, 1) + quantity = ( + pynutil.delete("quantity:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + optional_quantity = pynini.closure(pynutil.insert(" ") + quantity + delete_space, 0, 1) + graph = optional_integer + optional_fractional + optional_quantity + self.numbers = graph + graph = optional_sign + graph + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/mr/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/time.py new file mode 100644 index 000000000..7cc99b311 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/time.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.mr.graph_utils import NEMO_DIGIT, GraphFst, delete_space + + +class TimeFst(GraphFst): + """ + Finite state transducer for verbalizing time, e.g. + e.g. time { hours: "४" minutes: "३०" } -> ०४:३० + e.g. time { hours: "११" minutes: "३०" } -> ११:३० + e.g. time { hours: "८" minutes: "१५" } -> ०८:१५ + """ + + def __init__(self): + super().__init__(name="time", kind="verbalize") + add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("०") + NEMO_DIGIT) + hour = ( + pynutil.delete("hours:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete("\"") + ) + minute = ( + pynutil.delete("minutes:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete("\"") + ) + graph = ( + (hour @ add_leading_zero_to_double_digit) + + delete_space + + pynutil.insert(":") + + (minute @ add_leading_zero_to_double_digit) + ) + + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/mr/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/verbalize.py new file mode 100644 index 000000000..ca729c37b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/verbalize.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.mr.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.mr.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.mr.verbalizers.date import DateFst +from nemo_text_processing.inverse_text_normalization.mr.verbalizers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.mr.verbalizers.time import TimeFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + """ + + def __init__(self): + super().__init__(name="verbalize", kind="verbalize") + cardinal_graph = CardinalFst().fst + decimal_graph = DecimalFst().fst + time_graph = TimeFst().fst + date_graph = DateFst().fst + graph = cardinal_graph | decimal_graph | time_graph | date_graph + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/mr/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/verbalize_final.py new file mode 100644 index 000000000..2b4dccc7e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/verbalize_final.py @@ -0,0 +1,45 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.mr.graph_utils import GraphFst, delete_extra_space, delete_space +from nemo_text_processing.inverse_text_normalization.mr.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.mr.verbalizers.word import WordFst + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence, e.g. 
+ tokens { name: "दुपारचे" } tokens { time { hours: "२" minutes: "३०" } } tokens { name: "वाजले" } tokens { name: "आहेत" } -> दुपारचे 0२:३० वाजले आहेत + """ + + def __init__(self): + super().__init__(name="verbalize_final", kind="verbalize") + verbalize = VerbalizeFst().fst + word = WordFst().fst + types = verbalize | word + graph = ( + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/mr/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/word.py new file mode 100644 index 000000000..f2bab8fa4 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/word.py @@ -0,0 +1,39 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.mr.graph_utils import ( + NEMO_CHAR, + NEMO_SIGMA, + GraphFst, + delete_space, +) + + +class WordFst(GraphFst): + """ + Finite state transducer for verbalizing plain tokens + e.g. 
tokens { name: "चालणे" } -> चालणे + """ + + def __init__(self): + super().__init__(name="word", kind="verbalize") + chars = pynini.closure(NEMO_CHAR - " ", 1) + char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + + self.fst = graph.optimize() diff --git a/tests/nemo_text_processing/mr/__init__.py b/tests/nemo_text_processing/mr/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/tests/nemo_text_processing/mr/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..e5ccff889 --- /dev/null +++ b/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,33 @@ +शून्य~० +उणे शंभर~-१०० +उणे सत्तावीस~-२७ +तीनशे एक~३०१ +सातशे~७०० +सातशे बावीस~७२२ +तीन हजार~३००० +आठ हजार तीन~८००३ +पाच हजार नव्वद~५०९० +उणे दोन हजार चाळीस~-२०४० +एक हजार सातशे अडुसष्ठ~१७६८ +नऊ हजार दोनशे~९२०० +तीन हजार एकशे~३१०० +एकतीसशे~३१०० +अठ्ठावन्नशे एक~५८०१ +उणे बावन्नशे~-५२०० +पंचाहत्तरशे सोळा~७५१६ +तेहतीस हजार~३३००० +सत्त्याण्णव हजार तीन~९७००३ +एक्याऐंशी हजार एक्याऐंशी~८१०८१ +सतरा हजार सातशे~१७७०० +उणे छप्पन हजार~-५६००० +उणे त्र्याऐंशी हजार सातशे पाच~-८३७०५ +चार लाख अठरा हजार~४१८००० +सात लाख आठ~७००००८ +सत्तावन्न लाख अठ्ठ्यात्तर हजार तीनशे नव्व्याण्णव~५७७८३९९ +उणे नऊ लाख दहा~-९०००१० +आठ कोटी एकतीस लाख बारा हजार नऊशे त्रेचाळीस~८३११२९४३ +एक्याण्णव कोटी सत्तर लाख~९१७०००००० +तेहतीस कोटी तेहतीस लाख बासष्ट हजार~३३३३६२००० +तेहतीस कोटी तेहतीस लाख पंचवीस~३३३३०००२५ +उणे तीन कोटी तेहतीस लाख~-३३३००००० +उणे तेहतीस कोटी तेहतीस लाख~-३३३३००००० \ No newline at end of file diff --git a/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..c4e0455d2 --- /dev/null +++ b/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_date.txt @@ -0,0 +1,29 @@ +दहा जून दोन हजार~१० जून २००० +जानेवारी तेरा दोन हजार तेवीस~जानेवारी १३ २०२३ +जानेवारी दहा~जानेवारी १० +जानेवारी दोन हजार~जानेवारी २००० +पंधरा फेब्रुवारी अठराशे सतरा~१५ फेब्रुवारी १८१७ +इसवी सन बाराशे सत्याहत्तर~इ.स. १२७७ +इसवी सन पूर्व एकोणीसशे नव्व्याण्णव~इ.स.पू. १९९९ +ऑक्टोबर तेवीस सातशे आठ~ऑक्टोबर २३ ७०८ +फेब्रुवारी एकोणतीस~फेब्रुवारी २९ +इसवी सन सतराशे~इ.स. 
१७०० +मार्च बावीस एकोणीसशे अठ्ठ्याण्णव~मार्च २२ १९९८ +एप्रिल एक अकराशे अकरा~एप्रिल १ ११११ +बावीस मे एकोणीसशे एकसष्ट~२२ मे १९६१ +एकतीस जुलै एक हजार~३१ जुलै १००० +ऑगस्ट सात~ऑगस्ट ७ +सप्टेंबर सत्तावीस दोन हजार सहा~सप्टेंबर २७ २००६ +नऊ ऑक्टोबर दोन हजार दोन~९ ऑक्टोबर २००२ +नोव्हेंबर सोळाशे सत्तेचाळीस~नोव्हेंबर १६४७ +एकतीस डिसेंबर एकोणीसशे नव्व्याण्णव~३१ डिसेंबर १९९९ +मार्च~मार्च +एप्रिल अठराशे एक~एप्रिल १८०१ +इसवी सन पूर्व चारशे~इ.स.पू. ४०० +इसवी सन दोन हजार दहा~इ.स. २०१० +जुलै पंधरा~जुलै १५ +वीस ऑगस्ट एकोणीसशे~२० ऑगस्ट १९०० +चौदा सप्टेंबर~१४ सप्टेंबर +ऑक्टोबर चार~ऑक्टोबर ४ +तेरा नोव्हेंबर दोन हजार तेवीस~१३ नोव्हेंबर २०२३ +डिसेंबर पंचवीस~डिसेंबर २५ \ No newline at end of file diff --git a/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..2ec11fefc --- /dev/null +++ b/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_decimal.txt @@ -0,0 +1,29 @@ +तेहतीस पूर्णांक तीन~३३.३ +चारशे पूर्णांक शून्य पाच~४००.०५ +आठशे पूर्णांक शून्य तीन एक~८००.०३१ +दोन हजार पूर्णांक आठ एक दोन पाच~२०००.८१२५ +चार लाख~४ लाख +उणे बाहत्तर लाख~-७२ लाख +तेहतीस लाख सत्तावीस हजार आठशे वीस पूर्णांक नऊ~३३२७८२०.९ +एकेचाळीस पूर्णांक तीन लाख~४१.३ लाख +उणे एक्याण्णव पूर्णांक चार कोटी~-९१.४ कोटी +दहा पूर्णांक पाच सहा~१०.५६ +सात पूर्णांक सात~७.७ +पंचाहत्तर कोटी~७५ कोटी +उणे बारा पूर्णांक एक पाच कोटी~-१२.१५ कोटी +बावन्न पूर्णांक एक शून्य चार आठ पाच~५२.१०४८५ +उणे साठ हजार पूर्णांक नऊ नऊ नऊ शून्य~-६००००.९९९० +शून्य पूर्णांक आठ सात सहा~०.८७६ +उणे शून्य पूर्णांक एक दोन तीन~-०.१२३ +शंभर पूर्णांक नऊ~१००.९ +नव्व्याण्णव पूर्णांक नऊ नऊ नऊ~९९.९९९ +सतरा लाख~१७ लाख +उणे दोन पूर्णांक आठ एक दोन~-२.८१२ +सदुसष्ट पूर्णांक चार कोटी~६७.४ कोटी +उणे ऐंशी पूर्णांक एक सहा नऊ आठ लाख~-८०.१६९८ लाख +शंभर पूर्णांक एक दोन तीन चार पाच~१००.१२३४५ +शंभर कोटी~१०० कोटी +उणे सत्तर पूर्णांक पाच~-७०.५ +आठ पूर्णांक सहा लाख~८.६ लाख +पाच हजार दोनशे अडतीस पूर्णांक चार 
सहा~५२३८.४६ +चाळीस कोटी~४० कोटी \ No newline at end of file diff --git a/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..ea31ca0a5 --- /dev/null +++ b/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_time.txt @@ -0,0 +1,29 @@ +साडे चार~०४:३० +साडे अकरा~११:३० +सव्वा आठ~०८:१५ +सव्वा बारा~१२:१५ +पावणे सहा~०५:४५ +पावणे दहा~०९:४५ +अकराला पाच मिनिटे~१०:५५ +आठला सात मिनिटे~०७:५३ +अकरा वाजून दोन मिनिटे~११:०२ +अकराला दोन मिनिट~१०:५८ +बारा वाजून पाच मिनिटे~१२:०५ +एक वाजून सात मिनिटे~०१:०७ +दीड~०१:३० +अडीच~०२:३० +नऊ वाजून वीस मिनिटे~०९:२० +सातला बारा मिनिटे~०६:४८ +पावणे अकरा~१०:४५ +दहा वाजून दहा मिनिटे~१०:१० +एकला एक मिनिट~१२:५९ +बारा वाजून पन्नास मिनिटे~१२:५० +चार वाजून पंचेचाळीस मिनिटे~०४:४५ +पावणे पाच~०४:४५ +पावणे एक~१२:४५ +सव्वा एक~०१:१५ +दहाला एक मिनिट~०९:५९ +सहा वाजून एक मिनिट~०६:०१ +नऊ वाजून तीस मिनिटे~०९:३० +दहाला पंधरा मिनिटे~०९:४५ +दोन वाजून अठ्ठेचाळीस मिनिटे~०२:४८ \ No newline at end of file diff --git a/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..3b7ef04e3 --- /dev/null +++ b/tests/nemo_text_processing/mr/data_inverse_text_normalization/test_cases_word.txt @@ -0,0 +1,11 @@ +~ +, मी~, मी +म ~म +मी~मी +आम्ही~आम्ही +तिथे~तिथे +थोडे~थोडे +आई~आई +वडील~वडील +झाड~झाड +पिणे~पिणे \ No newline at end of file diff --git a/tests/nemo_text_processing/mr/test_cardinal.py b/tests/nemo_text_processing/mr/test_cardinal.py new file mode 100644 index 000000000..e7bd452fd --- /dev/null +++ b/tests/nemo_text_processing/mr/test_cardinal.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestCardinal: + inverse_normalizer_mr = InverseNormalizer(lang='mr', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('mr/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_mr.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/mr/test_date.py b/tests/nemo_text_processing/mr/test_date.py new file mode 100644 index 000000000..4ad5eb74d --- /dev/null +++ b/tests/nemo_text_processing/mr/test_date.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDate: + inverse_normalizer_mr = InverseNormalizer(lang='mr', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('mr/data_inverse_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_mr.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/mr/test_decimal.py b/tests/nemo_text_processing/mr/test_decimal.py new file mode 100644 index 000000000..e6f7d2d41 --- /dev/null +++ b/tests/nemo_text_processing/mr/test_decimal.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDecimal: + inverse_normalizer_mr = InverseNormalizer(lang='mr', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('mr/data_inverse_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_mr.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/mr/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/mr/test_sparrowhawk_inverse_text_normalization.sh new file mode 100644 index 000000000..9166db713 --- /dev/null +++ b/tests/nemo_text_processing/mr/test_sparrowhawk_inverse_text_normalization.sh @@ -0,0 +1,49 @@ +#! /bin/sh + +PROJECT_DIR=/workspace/tests + +runtest () { + input=$1 + cd /workspace/sparrowhawk/documentation/grammars + + # read test file + while read testcase; do + IFS='~' read spoken written <<< $testcase + denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + + # trim white space + written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$spoken" "$written" "$denorm_pred" + done < "$input" +} + +testITNCardinal() { + input=$PROJECT_DIR/mr/data_inverse_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testITNDecimal() { + input=$PROJECT_DIR/mr/data_inverse_text_normalization/test_cases_decimal.txt + runtest $input +} + +testITNTime() { + input=$PROJECT_DIR/mr/data_inverse_text_normalization/test_cases_time.txt + runtest $input +} + +testITNDate() { + 
input=$PROJECT_DIR/mr/data_inverse_text_normalization/test_cases_date.txt + runtest $input +} + +testITNWord() { + input=$PROJECT_DIR/mr/data_inverse_text_normalization/test_cases_word.txt + runtest $input +} + +# Load shUnit2 +. $PROJECT_DIR/../shunit2/shunit2 diff --git a/tests/nemo_text_processing/mr/test_time.py b/tests/nemo_text_processing/mr/test_time.py new file mode 100644 index 000000000..5571a5349 --- /dev/null +++ b/tests/nemo_text_processing/mr/test_time.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTime: + inverse_normalizer_mr = InverseNormalizer(lang='mr', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('mr/data_inverse_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_mr.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/mr/test_word.py b/tests/nemo_text_processing/mr/test_word.py new file mode 100644 index 000000000..263fe01a3 --- /dev/null +++ b/tests/nemo_text_processing/mr/test_word.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWord: + inverse_normalizer_mr = InverseNormalizer(lang='mr', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('mr/data_inverse_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_mr.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tools/text_processing_deployment/export_grammars.sh b/tools/text_processing_deployment/export_grammars.sh index 95f4edafb..69c4569b8 100644 --- a/tools/text_processing_deployment/export_grammars.sh +++ b/tools/text_processing_deployment/export_grammars.sh @@ -32,7 +32,7 @@ GRAMMARS="itn_grammars" # tn_grammars INPUT_CASE="lower_cased" # cased -LANGUAGE="en" # language, {'en', 'es', 'de','zh'} supports both TN and ITN, {'pt', 'ru', 'fr', 'vi'} supports ITN only +LANGUAGE="en" # language, {'en', 'es', 'de','zh'} supports both TN and ITN, {'pt', 'ru', 'fr', 'vi', 'mr'} supports ITN only MODE="export" # default is one of {'export', 'interactive', 'test', 'ci'}. 
Default "export" OVERWRITE_CACHE="True" # Set to False to re-use .far files FORCE_REBUILD="False" # Set to True to re-build docker file diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 22b455236..c40d5d6d0 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -80,7 +80,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=["en", "de", "es", "pt", "ru", 'fr', 'hu', 'sv', 'vi', 'zh', 'ar', 'it', 'es_en', 'hy'], + choices=["en", "de", "es", "pt", "ru", 'fr', 'hu', 'sv', 'vi', 'zh', 'ar', 'it', 'es_en', 'hy', 'mr'], type=str, default='en', ) @@ -111,7 +111,7 @@ def parse_args(): if __name__ == '__main__': args = parse_args() - if args.language in ['pt', 'ru', 'vi', 'es_en'] and args.grammars == 'tn_grammars': + if args.language in ['pt', 'ru', 'vi', 'es_en', 'mr'] and args.grammars == 'tn_grammars': raise ValueError('Only ITN grammars could be deployed in Sparrowhawk for the selected languages.') if args.language == 'en': @@ -228,6 +228,13 @@ def parse_args(): from nemo_text_processing.inverse_text_normalization.es_en.verbalizers.verbalize import ( VerbalizeFst as ITNVerbalizeFst, ) + elif args.language == 'mr': + from nemo_text_processing.inverse_text_normalization.mr.taggers.tokenize_and_classify import ( + ClassifyFst as ITNClassifyFst, + ) + from nemo_text_processing.inverse_text_normalization.mr.verbalizers.verbalize import ( + VerbalizeFst as ITNVerbalizeFst, + ) elif args.language == 'hy': from nemo_text_processing.inverse_text_normalization.hy.taggers.tokenize_and_classify import ( ClassifyFst as ITNClassifyFst, From 28409b25a09604116007de7fb675a433bd7871c3 Mon Sep 17 00:00:00 2001 From: tbartley94 <90423858+tbartley94@users.noreply.github.com> Date: Wed, 13 Mar 2024 13:42:41 -0700 Subject: [PATCH 12/90] jenkins fix (#150) * jenkins fix Signed-off-by: Travis Bartley * removing armenian to 
troubleshoot jenkins Signed-off-by: Travis Bartley * removing armenian to troubleshoot jenkins Signed-off-by: Travis Bartley * missing _init_ for python Signed-off-by: Travis Bartley * mislabled cache Signed-off-by: Travis Bartley --------- Signed-off-by: Travis Bartley Signed-off-by: Alex Cui --- Jenkinsfile | 4 ++-- .../hy/data/numbers/__init__.py | 13 +++++++++++++ .../hy/data/ordinals/__init__.py | 13 +++++++++++++ .../hy/data/time/__init__.py | 13 +++++++++++++ .../text_normalization/hy/data/numbers/__init__.py | 13 +++++++++++++ .../text_normalization/hy/data/ordinal/__init__.py | 13 +++++++++++++ .../text_normalization/hy/data/time/__init__.py | 13 +++++++++++++ .../hy/taggers/tokenize_and_classify.py | 2 +- 8 files changed, 81 insertions(+), 3 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/numbers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/ordinals/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hy/data/time/__init__.py create mode 100644 nemo_text_processing/text_normalization/hy/data/numbers/__init__.py create mode 100644 nemo_text_processing/text_normalization/hy/data/ordinal/__init__.py create mode 100644 nemo_text_processing/text_normalization/hy/data/time/__init__.py diff --git a/Jenkinsfile b/Jenkinsfile index d671a53c0..f9a225b27 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -252,7 +252,7 @@ pipeline { parallel { stage('L0: MR ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=mr --text="शून्य" --cache_dir ${MR_ITN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=mr --text="शून्य " --cache_dir ${MR_TN_CACHE}' } } stage('L0: HY TN grammars') { @@ -262,7 +262,7 @@ pipeline { } stage('L0: HY ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python 
nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hy --text="վեց" --cache_dir ${HY_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hy --text="վեց " --cache_dir ${HY_TN_CACHE}' } } } diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/hy/data/numbers/__init__.py new file mode 100644 index 000000000..9df65818d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/ordinals/__init__.py b/nemo_text_processing/inverse_text_normalization/hy/data/ordinals/__init__.py new file mode 100644 index 000000000..9df65818d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/ordinals/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/hy/data/time/__init__.py b/nemo_text_processing/inverse_text_normalization/hy/data/time/__init__.py new file mode 100644 index 000000000..9df65818d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hy/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hy/data/numbers/__init__.py b/nemo_text_processing/text_normalization/hy/data/numbers/__init__.py new file mode 100644 index 000000000..9df65818d --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hy/data/ordinal/__init__.py b/nemo_text_processing/text_normalization/hy/data/ordinal/__init__.py new file mode 100644 index 000000000..9df65818d --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/data/ordinal/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hy/data/time/__init__.py b/nemo_text_processing/text_normalization/hy/data/time/__init__.py new file mode 100644 index 000000000..9df65818d --- /dev/null +++ b/nemo_text_processing/text_normalization/hy/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hy/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hy/taggers/tokenize_and_classify.py index 08e121f86..a4562b436 100644 --- a/nemo_text_processing/text_normalization/hy/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hy/taggers/tokenize_and_classify.py @@ -61,7 +61,7 @@ def __init__( far_file = None if cache_dir is not None and cache_dir != "None": os.makedirs(cache_dir, exist_ok=True) - far_file = os.path.join(cache_dir, f"_hy_itn_{input_case}.far") + far_file = os.path.join(cache_dir, f"_hy_tn_{input_case}.far") if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] logger.info(f"ClassifyFst.fst was restored from {far_file}.") From 0032b2b810513c89481689db4ce51203a06b83c6 Mon Sep 17 00:00:00 2001 From: Evelina <10428420+ekmb@users.noreply.github.com> Date: Wed, 13 Mar 2024 14:58:07 -0700 Subject: [PATCH 13/90] r0.3.0 release (#151) Signed-off-by: Evelina Signed-off-by: Alex Cui --- nemo_text_processing/package_info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/package_info.py b/nemo_text_processing/package_info.py index 28e1c8984..2638d7176 100644 --- a/nemo_text_processing/package_info.py +++ b/nemo_text_processing/package_info.py @@ -14,8 +14,8 @@ MAJOR = 0 -MINOR = 2 -PATCH = 2 +MINOR = 3 +PATCH = 0 PRE_RELEASE = 'rc0' # Use the following formatting: (major, minor, patch, pre-release) From 
5bbab9c5d741434fce3d9546ff3640b924933e3d Mon Sep 17 00:00:00 2001 From: Sasha Meister <117230141+ssh-meister@users.noreply.github.com> Date: Tue, 19 Mar 2024 15:51:04 +0100 Subject: [PATCH 14/90] Fix text=line[text] to text=line[text_field] (#153) Signed-off-by: Sasha Meister Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/normalize_with_audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index 749041eb1..6a61efd4e 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -291,7 +291,7 @@ def normalize_line( line = json.loads(line) normalized_text = self.normalize( - text=line["text"], + text=line[text_field], verbose=verbose, n_tagged=n_tagged, punct_post_process=punct_post_process, From 86b19045f4aa80f44a957485f0e07ce336413fd6 Mon Sep 17 00:00:00 2001 From: kevsan4 <65792419+kevsan4@users.noreply.github.com> Date: Fri, 29 Mar 2024 17:43:36 -0700 Subject: [PATCH 15/90] use real string on docstring (#157) Signed-off-by: Kevin Sanders Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index bccb5ca06..14093dadf 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -517,7 +517,7 @@ def _process_batch( logger.warning(f'Normalized version saved at {output_filename}') def split_text_into_sentences(self, text: str, additional_split_symbols: str = "") -> List[str]: - """ + r""" Split text into sentences. 
Args: From 76f415c549202b49714651de2048eafd03f76762 Mon Sep 17 00:00:00 2001 From: anand-nv <105917641+anand-nv@users.noreply.github.com> Date: Wed, 17 Apr 2024 00:34:38 +0530 Subject: [PATCH 16/90] Sh postprocess (#147) * Add support for postprocessor far in sparrowhawk Signed-off-by: Anand Joseph * Cleanup Signed-off-by: Anand Joseph * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Choose between having a post processor or not Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> --------- Signed-off-by: Anand Joseph Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui --- .../en/test_sparrowhawk_normalization.sh | 3 +++ tools/text_processing_deployment/Dockerfile | 3 +-- tools/text_processing_deployment/docker/launch.sh | 2 +- tools/text_processing_deployment/pynini_export.py | 10 +++++++++- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh index 7374b9ef9..3d5f7ae19 100644 --- a/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh @@ -11,7 +11,10 @@ runtest () { while read testcase; do IFS='~' read written spoken <<< $testcase # replace non breaking space with breaking space + # Use below if postprocessor is not used. Comment if it is used denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + # Use below if postprocessor is used. 
Comment if it is not used + #denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration_pp.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') # trim white space spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" diff --git a/tools/text_processing_deployment/Dockerfile b/tools/text_processing_deployment/Dockerfile index 0fa7d855b..a3badfde5 100644 --- a/tools/text_processing_deployment/Dockerfile +++ b/tools/text_processing_deployment/Dockerfile @@ -32,7 +32,6 @@ RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/pr RUN tar xzvf protobuf-2.5.0.tar.gz RUN cd protobuf-2.5.0 && ./configure && make && make install && ldconfig RUN conda install -c conda-forge thrax=1.3.4 -y -RUN git clone https://github.com/yzhang123/sparrowhawk.git -RUN cd sparrowhawk && git checkout test && apt-get install -y autoconf && bash autoreconf && ./configure && make && make install && ldconfig +RUN git clone https://github.com/anand-nv/sparrowhawk.git && cd sparrowhawk && git checkout nemo_tests && apt-get install -y autoconf && bash autoreconf && ./configure && make && make install && ldconfig RUN git clone https://github.com/kward/shunit2.git RUN echo "DONE" diff --git a/tools/text_processing_deployment/docker/launch.sh b/tools/text_processing_deployment/docker/launch.sh index 98fdff534..dea998f7b 100644 --- a/tools/text_processing_deployment/docker/launch.sh +++ b/tools/text_processing_deployment/docker/launch.sh @@ -50,7 +50,7 @@ elif [[ $MODE == "test_itn_grammars" ]]; then fi echo $MOUNTS -docker run -it --rm \ +docker run -it -e LANG=C.UTF-8 -e LC_ALL=C.UTF-8 --rm \ --shm-size=4g \ --ulimit memlock=-1 \ --ulimit stack=67108864 \ diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index c40d5d6d0..7969ee239 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -52,6 +52,8 @@ 
def tn_grammars(**kwargs): ).fst } d['verbalize'] = {'ALL': TNVerbalizeFst(deterministic=True).fst, 'REDUP': pynini.accep("REDUP")} + if TNPostProcessingFst is not None: + d['post_process'] = {'POSTPROCESSOR': TNPostProcessingFst().fst} return d @@ -66,6 +68,8 @@ def export_grammars(output_dir, grammars): for category, graphs in grammars.items(): out_dir = os.path.join(output_dir, category) + if category == "post_process": + out_dir = os.path.join(output_dir, "verbalize") if not os.path.exists(out_dir): os.makedirs(out_dir) time.sleep(1) @@ -113,7 +117,7 @@ def parse_args(): if args.language in ['pt', 'ru', 'vi', 'es_en', 'mr'] and args.grammars == 'tn_grammars': raise ValueError('Only ITN grammars could be deployed in Sparrowhawk for the selected languages.') - + TNPostProcessingFst = None if args.language == 'en': from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import ( ClassifyFst as ITNClassifyFst, @@ -124,7 +128,11 @@ def parse_args(): from nemo_text_processing.text_normalization.en.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, ) + from nemo_text_processing.text_normalization.en.verbalizers.post_processing import ( + PostProcessingFst as TNPostProcessingFst, + ) from nemo_text_processing.text_normalization.en.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + elif args.language == 'de': from nemo_text_processing.inverse_text_normalization.de.taggers.tokenize_and_classify import ( ClassifyFst as ITNClassifyFst, From dea043945157b8aed1a4136b4b276d597ee330e8 Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Wed, 24 Apr 2024 21:06:58 -0400 Subject: [PATCH 17/90] update run_evaluate script for cased itn (#164) * update run_evaluate script for cased itn Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mariana Graterol Fuenmayor 
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui --- .../inverse_text_normalization/run_evaluate.py | 17 ++++++++++++++--- .../text_normalization/data_loader_utils.py | 11 ++++++----- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 29fc935cf..925be6e58 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -33,8 +33,14 @@ def parse_args(): parser = ArgumentParser() parser.add_argument("--input", help="input file path", type=str) parser.add_argument( - "--lang", help="language", choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi', 'hy'], default="en", type=str + "--lang", + help="language", + choices=["ar", "de", "en", "es", "es_en", "fr", "hy", "mr", "pt", "ru", "sv", "vi", "zh"], + default="en", + type=str, ) + parser.add_argument("--input_case", choices=["lower_cased", "cased"]) + parser.add_argument("--output_case", choices=["lower_cased", "cased"]) parser.add_argument( "--cat", dest="category", @@ -54,10 +60,15 @@ def parse_args(): if args.lang == 'en': from nemo_text_processing.inverse_text_normalization.en.clean_eval_data import filter_loaded_data file_path = args.input - inverse_normalizer = InverseNormalizer(lang=args.lang) + inverse_normalizer = InverseNormalizer(lang=args.lang, input_case=args.input_case) print("Loading training data: " + file_path) - training_data = load_files([file_path]) + if args.output_case == "lower_cased": + to_lower = True + elif args.output_case == "cased": + to_lower = False + + training_data = load_files([file_path], to_lower=to_lower) if args.filter: training_data = filter_loaded_data(training_data) diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py 
b/nemo_text_processing/text_normalization/data_loader_utils.py index b13851313..47fdc4e6b 100644 --- a/nemo_text_processing/text_normalization/data_loader_utils.py +++ b/nemo_text_processing/text_normalization/data_loader_utils.py @@ -46,7 +46,7 @@ ] -def _load_kaggle_text_norm_file(file_path: str) -> List[Instance]: +def _load_kaggle_text_norm_file(file_path: str, to_lower: bool) -> List[Instance]: """ https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish Loads text file in the Kaggle Google text normalization file format: \t\t<`self` if trivial class or normalized text> @@ -76,8 +76,9 @@ def _load_kaggle_text_norm_file(file_path: str) -> List[Instance]: res.append(Instance(token_type=EOS_TYPE, un_normalized="", normalized="")) else: l_type, l_token, l_normalized = parts - l_token = l_token.lower() - l_normalized = l_normalized.lower() + if to_lower: + l_token = l_token.lower() + l_normalized = l_normalized.lower() if l_type == PLAIN_TYPE: res.append(Instance(token_type=l_type, un_normalized=l_token, normalized=l_token)) @@ -86,7 +87,7 @@ def _load_kaggle_text_norm_file(file_path: str) -> List[Instance]: return res -def load_files(file_paths: List[str], load_func=_load_kaggle_text_norm_file) -> List[Instance]: +def load_files(file_paths: List[str], load_func=_load_kaggle_text_norm_file, to_lower: bool = True) -> List[Instance]: """ Load given list of text files using the `load_func` function. 
@@ -98,7 +99,7 @@ def load_files(file_paths: List[str], load_func=_load_kaggle_text_norm_file) -> """ res = [] for file_path in file_paths: - res.extend(load_func(file_path=file_path)) + res.extend(load_func(file_path=file_path, to_lower=to_lower)) return res From 400c9fb5a3d5447f30a62b55e3d7feb17f7d6131 Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Thu, 25 Apr 2024 18:07:48 -0400 Subject: [PATCH 18/90] remove unused function from ar tn decimals (#165) * remove unused function from ar tn decimals Signed-off-by: Mariana Graterol Fuenmayor * update ci date Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- .../text_normalization/ar/taggers/decimal.py | 25 ------------------- 2 files changed, 1 insertion(+), 26 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index f9a225b27..75cdaa17f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -11,7 +11,7 @@ pipeline { } environment { - AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-23-23-0' + AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0' DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-18-23-0' ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0' diff --git a/nemo_text_processing/text_normalization/ar/taggers/decimal.py b/nemo_text_processing/text_normalization/ar/taggers/decimal.py index 1d777f1b6..f276155e9 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/ar/taggers/decimal.py @@ -19,31 +19,6 @@ from nemo_text_processing.text_normalization.ar.utils import get_abs_path -def get_quantity(decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike": - """ - Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, - 
e.g. 5 مليون -> integer_part: "خمسة" quantity: "مليون" - e.g. 5.4 مليون -> integer_part: "خمسة" fractional_part: "اربعة من عشرة" quantity: "مليون" - - Args: - decimal: decimal FST - cardinal_up_to_hundred: cardinal FST - """ - numbers = cardinal_up_to_hundred - - res = ( - pynutil.insert('integer_part: "') - + numbers - + pynutil.insert('"') - + pynini.accep(" ") - + pynutil.insert('quantity: "') - + quantities - + pynutil.insert('"') - ) - res |= decimal + pynini.accep(" ") + pynutil.insert('quantity: "') + quantities + pynutil.insert('"') - return res - - class DecimalFst(GraphFst): """ Finite state transducer for classifying decimal, e.g. From 36fa3af40552bbba2ca363dbef05014d06990b03 Mon Sep 17 00:00:00 2001 From: "Buyuan(Alex) Cui" <69030297+BuyuanCui@users.noreply.github.com> Date: Tue, 30 Apr 2024 13:10:05 -0700 Subject: [PATCH 19/90] ZH sentence-level TN (#112) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Swedish telephone fix (#60) * port fix for telephone from swedish-itn branch Signed-off-by: Jim O'Regan * extend cardinal in non-deterministic mode Signed-off-by: Jim O'Regan * whitespace fixes Signed-off-by: Jim O'Regan * also fix in the verbaliser Signed-off-by: Jim O'Regan * Update Jenkinsfile Signed-off-by: Jim O’Regan --------- Signed-off-by: Jim O'Regan Signed-off-by: Jim O’Regan Signed-off-by: Alex Cui * log instead of print in graph_utils.py (#68) Signed-off-by: Enno Hermann Signed-off-by: Alex Cui * CER estimation speedup for audio-based text normalization (#73) * Replaced jiwer with editdistance to speed up CER estimation Signed-off-by: Vitaly Lavrukhin * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Vitaly Lavrukhin Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * add measure coverage for TN and ITN (#62) * add measure coverage for TN and ITN 
Signed-off-by: ealbasiri * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove unused imports Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove unused imports Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> * Remove unused imports Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update measure.py Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> --------- Signed-off-by: ealbasiri Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Signed-off-by: Alex Cui * upload es-ES, es-LA, fr-FR and it-IT g2p dicts (#63) * upload es-ES and fr-FR g2p dicts Signed-off-by: Mariana Graterol Fuenmayor * add inits Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add NALA Spanish dict Signed-off-by: Mariana Graterol Fuenmayor * rename Spanish and French dictionaries Signed-off-by: Mariana Graterol Fuenmayor * add Italian dictionary Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * add country codes from hu (#77) Signed-off-by: Jim O'Regan Signed-off-by: Alex Cui * fix electronic case for username (#75) * fix electronic username w/o . 
Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * disable sv tests Signed-off-by: Evelina * disable sv tests Signed-off-by: Evelina * fix ar test Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * disable sv tests Signed-off-by: Evelina * update ci dirs, enable sv tests Signed-off-by: Evelina --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * 0.1.8 release (#79) Signed-off-by: Evelina Signed-off-by: Alex Cui * Codeswitched ES/EN ITN (#78) * Initial commit for ES-EN codeswitched ITN Signed-off-by: Anand Joseph * Enable export for es_en codeswitched ITN Signed-off-by: Anand Joseph * Add whitelist, update weights Signed-off-by: Anand Joseph * Add tests for en_es, zone tagged separately in es Signed-off-by: Anand Joseph * Fix path to test data for sparrowhawk tests Signed-off-by: Anand Joseph * Update Jenkinsfile - enable ES/EN tests Signed-off-by: Anand Joseph * Add __init__.py files Signed-off-by: Anand Joseph * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix issues with failed docker build - due to archiving of debian and issues with re2 Signed-off-by: Anand Joseph * Remove unused imports and variables Signed-off-by: Anand Joseph * Update date Signed-off-by: Anand Joseph * Enable NBSP in sparrowhawk tests Signed-off-by: Anand Joseph * Update copyrights Signed-off-by: Anand Joseph * Update cache path in for ES/EN CI/CD Signed-off-by: Anand Joseph --------- Signed-off-by: Anand Joseph Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * electronic verbalizer fallback (#81) * 0.1.8 release Signed-off-by: Evelina * add elec fallback Signed-off-by: Evelina * update ci Signed-off-by: Evelina * 
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * minor normalize.py edit for usability (#84) * electronic verbalizer fallback (#81) * 0.1.8 release Signed-off-by: Evelina * add elec fallback Signed-off-by: Evelina * update ci Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Linnea Pari Leaver * documentation edits for grammar/clarity Signed-off-by: Linnea Pari Leaver * added --output_field flag for command line interface Signed-off-by: Linnea Pari Leaver * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Evelina Signed-off-by: Linnea Pari Leaver Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Linnea Pari Leaver Signed-off-by: Alex Cui * Swedish ITN (#40) * force two digits for month Signed-off-by: Jim O'Regan * put it in a function, because I reject the garbage pre-commit.ci came up with Signed-off-by: Jim O'Regan * wrap some more pieces Signed-off-by: Jim O'Regan * add graph pieces Signed-off-by: Jim O'Regan * delete junk Signed-off-by: Jim O'Regan * my copyright Signed-off-by: Jim O'Regan * add date verbaliser (copy from es) Signed-off-by: Jim O'Regan * tweaks Signed-off-by: Jim O'Regan * add date verbaliser Signed-off-by: Jim O'Regan * add right tokens Signed-off-by: Jim O'Regan * some tweaks, more needed Signed-off-by: Jim O'Regan * basic test cases Signed-off-by: Jim O'Regan * tweaks to TN date tagger Signed-off-by: Jim O'Regan * tweaks to ITN date tagger 
Signed-off-by: Jim O'Regan * tweaks to TN date tagger Signed-off-by: Jim O'Regan * remove duplicate Signed-off-by: Jim O'Regan * moved to tagger Signed-off-by: Jim O'Regan * nothing actually fixed here Signed-off-by: Jim O'Regan * now most tests pass Signed-off-by: Jim O'Regan * electronic Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fractions Signed-off-by: Jim O'Regan * extend Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * bare fractions is a bit of an overreach Signed-off-by: Jim O'Regan * whitelist Signed-off-by: Jim O'Regan * just inverting the TN whitelist tagger will not work/be useful Signed-off-by: Jim O'Regan * copy from English Signed-off-by: Jim O'Regan * overwrite with version from en Signed-off-by: Jim O'Regan * add basic test case Signed-off-by: Jim O'Regan * fix call Signed-off-by: Jim O'Regan * swap tsv sides Signed-off-by: Jim O'Regan * remove unused imports Signed-off-by: Jim O'Regan * add optional_era variable Signed-off-by: Jim O'Regan * add test case Signed-off-by: Jim O'Regan * make deterministic default, like most of the others Signed-off-by: Jim O'Regan * also add lowercase versions Signed-off-by: Jim O'Regan * replacing NEMO_SPACE does not work either Signed-off-by: Jim O'Regan * increasing weight... did not work last time Signed-off-by: Jim O'Regan * tweaking test cases, in case it was a sentence splitting issue. 
It was not Signed-off-by: Jim O'Regan * put the full stops back Signed-off-by: Jim O'Regan * add filler words Signed-off-by: Jim O'Regan * try splitting this out to see if it makes a difference Signed-off-by: Jim O'Regan * aha, this part should be non-deterministic only Signed-off-by: Jim O'Regan * single line only Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Revert "increasing weight... did not work last time" This reverts commit 39b020b50db745dfd6b281c8cbca45a033926996. Signed-off-by: Jim O'Regan * disabling ITN here makes TN work again(?) Signed-off-by: Jim O'Regan * Revert "disabling ITN here makes TN work again(?)" This reverts commit be49d7d5c687876e51c2e9ce1cf1e01491df280f. Signed-off-by: Jim O'Regan * changing the variable name fixes norm tests Signed-off-by: Jim O'Regan * change the variable names Signed-off-by: Jim O'Regan * add missing test tooling Signed-off-by: Jim O'Regan * copy telephone fixes from hu Signed-off-by: Jim O'Regan * copy telephone fixes from hu Signed-off-by: Jim O'Regan * add a piece for area codes for ITN Signed-off-by: Jim O'Regan * add country codes from hu Signed-off-by: Jim O'Regan * extend any_read_digit for ITN Signed-off-by: Jim O'Regan * country/area codes for ITN Signed-off-by: Jim O'Regan * first attempt Signed-off-by: Jim O'Regan * add to t&c Signed-off-by: Jim O'Regan * add to t&c Signed-off-by: Jim O'Regan * remove country codes for the time being, makes things ambiguous Signed-off-by: Jim O'Regan * basic test cases Signed-off-by: Jim O'Regan * fix Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove trailing whitespace Signed-off-by: Jim O'Regan * Update __init__.py Signed-off-by: Jim O’Regan * fix comment Signed-off-by: Jim O'Regan * fix comment Signed-off-by: Jim O'Regan * basic transform of TN tests Signed-off-by: Jim O'Regan * basic transformation 
of TN decimal tests Signed-off-by: Jim O'Regan * slight changes to date Signed-off-by: Jim O'Regan * tweak Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * include space Signed-off-by: Jim O'Regan * problem with tusen Signed-off-by: Jim O'Regan * problem with tusen was not that Signed-off-by: Jim O'Regan * add functions from hu Signed-off-by: Jim O'Regan * respect my own copyright xD Signed-off-by: Jim O'Regan * move data loading to constructor; had weirdness in this file, probably due to module-level python-suckage Signed-off-by: Jim O'Regan * move data loading, this has been an oddity before Signed-off-by: Jim O'Regan * try changing this year declaration Signed-off-by: Jim O'Regan * add year + era Signed-off-by: Jim O'Regan * eliminate more module-level data loading Signed-off-by: Jim O'Regan * Revert "eliminate more module-level data loading" This reverts commit 6a26e5d927817e1308e818758196924441ff7b3a. Signed-off-by: Jim O'Regan * expose variables Signed-off-by: Jim O'Regan * extra param for itn mode Signed-off-by: Jim O'Regan * change call Signed-off-by: Jim O'Regan * change comment Signed-off-by: Jim O'Regan * change comment Signed-off-by: Jim O'Regan * move data loading Signed-off-by: Jim O'Regan * fix parens Signed-off-by: Jim O'Regan * move data loading Signed-off-by: Jim O'Regan * adapt comments Signed-off-by: Jim O'Regan * adapt comments Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adapt/extend tests Signed-off-by: Jim O'Regan * fix dict init/change keys to something useful Signed-off-by: Jim O'Regan * initial stab at prefixed numbers Signed-off-by: Jim O'Regan * some adapting Signed-off-by: Jim O'Regan * insert kl. 
if absent Signed-off-by: Jim O'Regan * fix comments Signed-off-by: Jim O'Regan * the relative prefixed times Signed-off-by: Jim O'Regan * + comments Signed-off-by: Jim O'Regan * enable time Signed-off-by: Jim O'Regan * space in both directions Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix comment Signed-off-by: Jim O'Regan * fix hours to Signed-off-by: Jim O'Regan * split by before/after Signed-off-by: Jim O'Regan * delete, not insert Signed-off-by: Jim O'Regan * fix if Signed-off-by: Jim O'Regan * kl. 9 Signed-off-by: Jim O'Regan * copy from en Signed-off-by: Jim O'Regan * keep only get_abs_path Signed-off-by: Jim O'Regan * imports Signed-off-by: Jim O'Regan * add trimmed file Signed-off-by: Jim O'Regan * fix imports Signed-off-by: Jim O'Regan * two abs_paths... could be fun Signed-off-by: Jim O'Regan * minutes/seconds Signed-off-by: Jim O'Regan * suffix Signed-off-by: Jim O'Regan * delete, not insert Signed-off-by: Jim O'Regan * one optional Signed-off-by: Jim O'Regan * export variable Signed-off-by: Jim O'Regan * kl. or one of suffix/zone Signed-off-by: Jim O'Regan * already disambiguated Signed-off-by: Jim O'Regan * closure Signed-off-by: Jim O'Regan * do not insert kl. 
Signed-off-by: Jim O'Regan * fix test case Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix spelling Signed-off-by: Jim O'Regan * Delete measure.py Signed-off-by: Jim O’Regan * Delete money.py Signed-off-by: Jim O’Regan * remove unused pieces Signed-off-by: Jim O'Regan * remove unused pieces Signed-off-by: Jim O'Regan * remove unused test pieces Signed-off-by: Jim O'Regan * copy from es Signed-off-by: Jim O'Regan * add SV ITN Signed-off-by: Jim O'Regan * add/update __init__ Signed-off-by: Jim O'Regan * blank line Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix comment Signed-off-by: Jim O'Regan * fix lang Signed-off-by: Jim O'Regan * fix decimal verbaliser Signed-off-by: Jim O'Regan * fix Signed-off-by: Jim O'Regan * remove year, conflicts with cardinal Signed-off-by: Jim O'Regan * space before, not after Signed-off-by: Jim O'Regan * fix cardinal tests Signed-off-by: Jim O'Regan * spurious deletion Signed-off-by: Jim O'Regan * fix comment Signed-off-by: Jim O'Regan * unused imports Signed-off-by: Jim O'Regan * re-enable SV TN; enable SV ITN Signed-off-by: Jim O'Regan * Revert "re-enable SV TN; enable SV ITN" This reverts commit 3ce4dfde1f70a89afc274284f6e4c737b3fac95b. 
Signed-off-by: Jim O'Regan * fix singulras Signed-off-by: Jim O'Regan * add an export Signed-off-by: Jim O'Regan * change integer graph Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * move spaces Signed-off-by: Jim O'Regan * use cdrewrite Signed-off-by: Jim O'Regan * just EOS/BOS Signed-off-by: Jim O'Regan * fix typo Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Jim O'Regan * omit en/ett, because they are also articles Signed-off-by: Jim O'Regan * uncomment Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * unused Signed-off-by: Jim O'Regan * strip spaces from decimal part Signed-off-by: Jim O'Regan * export Signed-off-by: Jim O'Regan * partial fix, not what I wanted Signed-off-by: Jim O'Regan * move comment Signed-off-by: Jim O'Regan * en/ett cannot work in itn case Signed-off-by: Jim O'Regan * be more deliberate in graph construction Signed-off-by: Jim O'Regan * accept both Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * +2 tests Signed-off-by: Jim O'Regan * (try to) accept singular quantities for plurals Signed-off-by: Jim O'Regan * retry Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * oops Signed-off-by: Jim O'Regan * replace Signed-off-by: Jim O'Regan * arcmap Signed-off-by: Jim O'Regan * version without ones Signed-off-by: Jim O'Regan * add another test Signed-off-by: Jim O'Regan * change graph Signed-off-by: Jim O'Regan * simplify Signed-off-by: Jim O'Regan * get rid of this, this is where it goes wrong Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * 
more tests Signed-off-by: Jim O'Regan * add a test Signed-off-by: Jim O'Regan * multiple states from both ones, try removing and readding Signed-off-by: Jim O'Regan * remove ones, see if that fixes at least the bare quantities Signed-off-by: Jim O'Regan * works in the repl, dunno why it still breaks Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove duplicate Signed-off-by: Jim O'Regan * move definition Signed-off-by: Jim O'Regan * simplify Signed-off-by: Jim O'Regan * tweak Signed-off-by: Jim O'Regan * another test Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * local declaration, seems to not be working Signed-off-by: Jim O'Regan * more tests Signed-off-by: Jim O'Regan * match verbaliser Signed-off-by: Jim O'Regan * fix last two failing tests Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add missing tests for telephone and word Signed-off-by: Jim O'Regan * remove unused variable Signed-off-by: Jim O'Regan * remove unused imports Signed-off-by: Jim O'Regan * fix comment Signed-off-by: Jim O'Regan * get rid of convert_space, tests fail Signed-off-by: Jim O'Regan * put convert_spaces back, change test file; pytest fails Signed-off-by: Jim O'Regan * Revert "put convert_spaces back, change test file; pytest fails" This reverts commit a7bb7489137b8026aab02aff64df39e874630043. 
Signed-off-by: Jim O'Regan * put convert_spaces back, change test file; pytest fails, take 2 Signed-off-by: Jim O'Regan * deliberately remove spaces rather than have a non-determinism that comes out differently in sparrowhawk Signed-off-by: Jim O'Regan * try converting the non-breaking spaces in the shell script Signed-off-by: Jim O'Regan * wrong place Signed-off-by: Jim O'Regan * fix typo Signed-off-by: Jim O'Regan * fix path Signed-off-by: Jim O'Regan * export Signed-off-by: Jim O'Regan * export Signed-off-by: Jim O'Regan * remove unused Signed-off-by: Jim O'Regan * Update date.py Signed-off-by: Jim O’Regan * Update time.py Signed-off-by: Jim O’Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix comment Signed-off-by: Jim O’Regan * trim comments Signed-off-by: Jim O’Regan * remove commented line Signed-off-by: Jim O’Regan * en halv Signed-off-by: Jim O’Regan * Update test_sparrowhawk_inverse_text_normalization.sh Signed-off-by: Jim O’Regan --------- Signed-off-by: Jim O'Regan Signed-off-by: Jim O’Regan Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * Italian_TN (#67) * add TN italian Signed-off-by: GiacomoLeoneMaria * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix init Signed-off-by: GiacomoLeoneMaria * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix LOCATION Signed-off-by: GiacomoLeoneMaria * modify graph_utils Signed-off-by: GiacomoLeoneMaria * correct decimals Signed-off-by: GiacomoLeoneMaria * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix electronic Signed-off-by: Giacomo Cavallini * fix electronic Signed-off-by: Giacomo Cavallini * fix measure Signed-off-by: Giacomo Cavallini --------- Signed-off-by: GiacomoLeoneMaria Signed-off-by: Giacomo Cavallini 
Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Signed-off-by: Alex Cui * Zh itn (#74) * Add ZH ITN Signed-off-by: Anand Joseph * Fix copyrights and code cleanup Signed-off-by: Anand Joseph * Remove invalid tests Signed-off-by: Anand Joseph * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Resolve CodeQL issues Signed-off-by: Anand Joseph * Cleanup Signed-off-by: Anand Joseph * Fix missing 'zh' option for ITN and correct comment Signed-off-by: Anand Joseph * Update __init__.py Change to zh instead of en for the imports. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update for decimal test data Signed-off-by: BuyuanCui * update for langauge import Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update for Chinese punctuations Signed-off-by: BuyuanCui * a new class for whitelist Signed-off-by: BuyuanCui * PYNINI_AVAILABLE = False Signed-off-by: BuyuanCui * recreated due to file import format issue Signed-off-by: BuyuanCui * recreated due to format issue Signed-off-by: BuyuanCui * caught duplicates, removed Signed-off-by: BuyuanCui * removed duplicates, arranges for CHInese Yuan updates Signed-off-by: BuyuanCui * updates accordingly to the comments from last PR. Recreated some of the files due to format issues Signed-off-by: BuyuanCui * removed the hours_to and minute_to files used for back counting. ALso removed am and pm suffix files according to the last PR. 
Recreated some of them for format issue Signed-off-by: BuyuanCui * re-added this file to avoid data file import error Signed-off-by: BuyuanCui * updated gramamr according to last PR. Removed the acceptance of 千 Signed-off-by: BuyuanCui * updates Signed-off-by: BuyuanCui * updated according to last PR. Removed comma after decimal points Signed-off-by: BuyuanCui * gramamr for Fraction Signed-off-by: BuyuanCui * gramamr for money and updated according to last PR. Plus process of 元 Signed-off-by: BuyuanCui * ordinal grammar. updates due to the updates in cardinal grammar Signed-off-by: BuyuanCui * updated accordingly to last PR comments. removing am and pm and allowing simple mandarin expression Signed-off-by: BuyuanCui * arrangements Signed-off-by: BuyuanCui * added whitelist grammar Signed-off-by: BuyuanCui * word grammar for non-classified items Signed-off-by: BuyuanCui * updated cardinal, decimal, time, itn data Signed-off-by: BuyuanCui * updates according to last PR Signed-off-by: BuyuanCui * updates according to the updates for cardinal grammar Signed-off-by: BuyuanCui * updates for more Mandarin punctuations Signed-off-by: BuyuanCui * updated accordingly to last PR. 
removing am pm Signed-off-by: BuyuanCui * adjustment on the weight Signed-off-by: BuyuanCui * updated accordingly to the targger updates Signed-off-by: BuyuanCui * updated accordingly to the time tagger Signed-off-by: BuyuanCui * updates according to changes in tagger on am and pm Signed-off-by: BuyuanCui * verbalizer for fraction Signed-off-by: BuyuanCui * added for mandarin grammar Signed-off-by: BuyuanCui * kept this file because using English utils results in data namin error Signed-off-by: BuyuanCui * merge conflict Signed-off-by: BuyuanCui * removed unsed imports Signed-off-by: BuyuanCui * deleted unsed import os Signed-off-by: BuyuanCui * deleted unsed variables Signed-off-by: BuyuanCui * removed unsed imports Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates and edits based on pr checks Signed-off-by: BuyuanCui * updates and edits based on pr checks Signed-off-by: BuyuanCui * format issue, reccreated Signed-off-by: BuyuanCui * format issue recreated Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixed codeing style/format Signed-off-by: BuyuanCui * fixed coding style and format Signed-off-by: BuyuanCui * removed duplicated graph for 毛 Signed-off-by: BuyuanCui * removed the comment Signed-off-by: BuyuanCui * removed the comment Signed-off-by: BuyuanCui * removing unnecessary comments Signed-off-by: BuyuanCui * unnecessary comment removed Signed-off-by: BuyuanCui * test file updated for more cases Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updated with a comment explaining why this file is kept Signed-off-by: BuyuanCui * updated the file explaining why this file is kept Signed-off-by: BuyuanCui * added Mandarin as zh Signed-off-by: BuyuanCui * removing for dplication Signed-off-by: BuyuanCui * 
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removed unused NEMO objects Signed-off-by: BuyuanCui * removed duplicates Signed-off-by: BuyuanCui * removing unsed imports Signed-off-by: BuyuanCui * updates to fix test file failures Signed-off-by: BuyuanCui * updates to fix file failtures Signed-off-by: BuyuanCui * updates to resolve test case failture Signed-off-by: BuyuanCui * updates to resolve test case failure Signed-off-by: BuyuanCui * updates to resolve test case failure Signed-off-by: BuyuanCui * updates to resolve test case failure Signed-off-by: BuyuanCui * updates to adap to cardinal grammar changes Signed-off-by: BuyuanCui * updates to adapt to grammar changes Signed-off-by: BuyuanCui * updates to adopt to cardinal grammar changes Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix style Signed-off-by: BuyuanCui * fix style Signed-off-by: BuyuanCui * fix style Signed-off-by: BuyuanCui * fix style Signed-off-by: BuyuanCui * fixing pr checks Signed-off-by: BuyuanCui * removed // for zhtn/itn cache Signed-off-by: BuyuanCui * Update inverse_normalize.py Added zh as a selection to pass Jenkins checks. 
Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> --------- Signed-off-by: Anand Joseph Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> Signed-off-by: BuyuanCui Co-authored-by: Alex Cui Co-authored-by: Anand Joseph Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * updated pynini_export.py file to create far files (#88) Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * readd Swedish (#87) Signed-off-by: Jim O'Regan Signed-off-by: Alex Cui * Zh tn 0712 (#89) * updates Signed-off-by: BuyuanCui * updates and fixings according to document on natonal gideline Signed-off-by: BuyuanCui * Decimal grammar added Signed-off-by: BuyuanCui * fraction updated Signed-off-by: BuyuanCui * money updated Signed-off-by: BuyuanCui * ordinal grammar added Signed-off-by: BuyuanCui * punctuation grammar added Signed-off-by: BuyuanCui * time gramamr updated Signed-off-by: BuyuanCui * tokenizaer updated Signed-off-by: BuyuanCui * updates on certificate Signed-off-by: BuyuanCui * data updated and added due to updates and chanegs to the existing grammar Signed-off-by: BuyuanCui * cardinal updated Signed-off-by: BuyuanCui * date grammar changed Signed-off-by: BuyuanCui * decimal grammar added Signed-off-by: BuyuanCui * grammar updated Signed-off-by: BuyuanCui * grammar updated Signed-off-by: BuyuanCui * grammar added Signed-off-by: BuyuanCui * grammar updates Signed-off-by: BuyuanCui * test data added Signed-off-by: BuyuanCui * test python file edits Signed-off-by: BuyuanCui * updates for tn1.0 and previous tn grammar from contribution Signed-off-by: BuyuanCui * test cases updated Signed-off-by: BuyuanCui * coding style fixed Signed-off-by: BuyuanCui * dates updated for init files Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updated the date for zh Signed-off-by: BuyuanCui * removed unsed 
imports Signed-off-by: BuyuanCui * removed comments Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added back the itn tests Signed-off-by: BuyuanCui * added back measure and math from previou TN Signed-off-by: BuyuanCui * updated for tests reruns Signed-off-by: BuyuanCui * updats Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updated weights Signed-off-by: BuyuanCui --------- Signed-off-by: BuyuanCui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * Zh tn char (#95) * file name change Signed-off-by: BuyuanCui * file name change Signed-off-by: BuyuanCui * file name change Signed-off-by: BuyuanCui * file name change Signed-off-by: BuyuanCui * file name change Signed-off-by: BuyuanCui * file name Signed-off-by: BuyuanCui * file name Signed-off-by: BuyuanCui * file name Signed-off-by: BuyuanCui * file name Signed-off-by: BuyuanCui * file name Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * code stle Signed-off-by: BuyuanCui * fixed import error Signed-off-by: BuyuanCui --------- Signed-off-by: BuyuanCui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * audio-based TN fix for empty pred_text/text (#92) * fix for empty pred_text Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add unittests Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix path Signed-off-by: Evelina * fix path Signed-off-by: Evelina * fix pytest Signed-off-by: Evelina --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * pip 1.2.0 Signed-off-by: Evelina Signed-off-by: Alex Cui * French tn (#91) * add tests for fr tn Signed-off-by: Mariana Graterol Fuenmayor * add fr tn for cardinals, decimals, fractions and ordinals Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * delete it far files from tools Signed-off-by: Mariana Graterol Fuenmayor * add languages to run_evaluate Signed-off-by: Mariana Graterol Fuenmayor * remove ambiguous spacing Signed-off-by: Mariana Graterol Fuenmayor * enable sh testing for fr tn Signed-off-by: Mariana Graterol Fuenmayor * fix bug with ordinals Signed-off-by: Mariana Graterol Fuenmayor * update jenkinsfile cache date Signed-off-by: Mariana Graterol Fuenmayor * fix test for ordinals Signed-off-by: Mariana Graterol Fuenmayor * update tn cache for fr Signed-off-by: Mariana Graterol Fuenmayor * resolve codeql issues Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * Add whitelist_tech.tsv (#96) Signed-off-by: Anand Joseph Signed-off-by: Alex Cui * Zhitn 0727 (#93) * updates on itn grammar to pass sparrowhawk tests Signed-off-by: BuyuanCui * updats for sparrowhawk tests Signed-off-by: BuyuanCui * updates fro sparrowhawk tests Signed-off-by: BuyuanCui * coding style fix Signed-off-by: BuyuanCui * updates for coding style and sparrowhawk test Signed-off-by: BuyuanCui * updated classes for tests on whitelist and word grammar Signed-off-by: BuyuanCui * added for tests on whitelist Signed-off-by: BuyuanCui * added for test on word Signed-off-by: BuyuanCui * added to run test on whitelist Signed-off-by: BuyuanCui * added to run test on word Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update test_word.py Removed unused import. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_word.py Removed imports according to CodeQL Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_whitelist.py Removing imports according to CodeQL Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_whitelist.py Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update Jenkinsfile changed zh cache to 07-27-23 as it is the latest update. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> --------- Signed-off-by: BuyuanCui Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * Es tn romans fix (#98) * fix es tn roman exceptions Signed-off-by: Mariana Graterol Fuenmayor * update jenkinsfile Signed-off-by: Mariana Graterol Fuenmayor * update eval script for ITN Signed-off-by: Mariana Graterol Fuenmayor * codeql fix Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Alex Cui * Change docker image (#102) Change docker image to one including sparrowhawk Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Signed-off-by: Alex Cui * Print warning instead exception (#97) * raise text Signed-off-by: Nikolay Karpov * text arg Signed-off-by: Nikolay Karpov * Failed text Signed-off-by: Nikolay Karpov * add logger Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rm raise Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci * logger Signed-off-by: Nikolay Karpov * NeMo-text-processing Signed-off-by: Nikolay Karpov * info level Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rm raise Signed-off-by: Nikolay Karpov * verbose Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Normalizer.select_verbalizer Signed-off-by: Nikolay Karpov * Exception Signed-off-by: Nikolay Karpov * verbose Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * restart ci Signed-off-by: Evelina --------- Signed-off-by: Nikolay Karpov Signed-off-by: Nikolay Karpov Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Nikolay Karpov Co-authored-by: Evelina Signed-off-by: Alex Cui * warning regardless of verbose flag (#107) * warning Signed-off-by: Nikolay Karpov * self.verbose Signed-off-by: Nikolay Karpov --------- Signed-off-by: Nikolay Karpov Signed-off-by: Alex Cui * Unpin setuptools (#106) Signed-off-by: Peter Plantinga Signed-off-by: Alex Cui * fixed warnings: File is not always closes. 
(#113) Signed-off-by: Xuesong Yang <16880-xueyang@users.noreply.gitlab-master.nvidia.com> Co-authored-by: Xuesong Yang <16880-xueyang@users.noreply.gitlab-master.nvidia.com> Signed-off-by: Alex Cui * fix bug #111 (ar currencies) (#117) * fix bug #111 (ar currencies) Signed-off-by: Mariana Graterol Fuenmayor * update ci folder Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Alex Cui * Logging clean up + IT TN fix (#118) * fix utils and it TN Signed-off-by: Evelina * clean up Signed-off-by: Evelina * fix logging Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix format Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix format Signed-off-by: Evelina * fix format Signed-off-by: Evelina * add IT TN to CI Signed-off-by: Evelina * update patch Signed-off-by: Evelina --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * Time_IT_TN (#105) * add time verbalizer Signed-off-by: GiacomoLeoneMaria * add time tagger and verba Signed-off-by: GiacomoLeoneMaria * add pytest time Signed-off-by: GiacomoLeoneMaria * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * codeQL Signed-off-by: GiacomoLeoneMaria * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix numbers with eight Signed-off-by: GiacomoLeoneMaria --------- Signed-off-by: GiacomoLeoneMaria Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * IT TN improvement on tests (#120) * add missing test cases Signed-off-by: Mariana Graterol Fuenmayor * fix bug with time tests Signed-off-by: Mariana Graterol Fuenmayor * update ci date Signed-off-by: 
Mariana Graterol Fuenmayor * add sentence test cases Signed-off-by: Mariana Graterol Fuenmayor * refine shortest path for irregular cardinals Signed-off-by: Mariana Graterol Fuenmayor * update ci date Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Alex Cui * add single letter exception for roman numerals (#121) * add single letter exception for roman numerals Signed-off-by: Mariana Graterol Fuenmayor * update ci dir Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Alex Cui * rewrote tokenizer Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * removed the file and replaced it with char in 1.8 Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * jenkins file update Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * to fix tn bug@ xuesong Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * tn bug Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Alex Cui * fixeds and updates Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Alex Cui * adjustments Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * testing commit Signed-off-by: Alex Cui * removing unsed file Signed-off-by: Alex Cui * updated test cases Signed-off-by: Alex Cui * updating etst cases Signed-off-by: Alex Cui * updates adapting to graphs Signed-off-by: Alex Cui * updated cases for SH tests Signed-off-by: Alex Cui * updated cases Signed-off-by: Alex Cui * added some sentences Signed-off-by: Alex Cui * test cases update Signed-off-by: Alex Cui * solving rebase issue, repushing changes Signed-off-by: Alex Cui * resolving conflict Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixings 
according to ci Signed-off-by: Alex Cui * fixings according to the ci Signed-off-by: Alex Cui * removed not used Signed-off-by: Alex Cui * notused removing Signed-off-by: Alex Cui * format issue Signed-off-by: Alex Cui * formt issue Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removing unused files Signed-off-by: Alex Cui * removing unused files Signed-off-by: Alex Cui * remiving unsed files; Signed-off-by: Alex Cui * removing unsed files Signed-off-by: Alex Cui * removing unsed files Signed-off-by: Alex Cui * added sentences as test cases Signed-off-by: Alex Cui * added senetnces as test cases Signed-off-by: Alex Cui * removed commentyed out tests Signed-off-by: Alex Cui * updating dates Signed-off-by: Alex Cui * attemps to fix bug Signed-off-by: Alex Cui * inprocess of fixing the bug Signed-off-by: Alex Cui * fixing existing issue Signed-off-by: Alex Cui * updated graph_utils, tokenize and classify, and word graphs Signed-off-by: Alex Cui * added bacl the ppostprocessor far creation Signed-off-by: Alex Cui * updated NEMO_NOT_ALPHA as a new variable Signed-off-by: Alex Cui * far files Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * combiedn into measure Signed-off-by: Alex Cui * removing and combined to meaasure Signed-off-by: Alex Cui * removing, not used Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates to fix space issue Signed-off-by: Alex Cui * updates to fix space issue Signed-off-by: Alex Cui * updates to fix space issue Signed-off-by: Alex Cui * updates to solve the space issue Signed-off-by: Alex Cui * resolving sh issue Signed-off-by: Alex Cui * resolving sh test issue Signed-off-by: Alex Cui * adding anands updates Signed-off-by: Alex Cui * data updated for measure and whitelist Signed-off-by: Alex Cui * 
updates Signed-off-by: Alex Cui * updates Signed-off-by: Alex Cui * updates Signed-off-by: Alex Cui * removing fraction and math part Signed-off-by: Alex Cui * removing comments Signed-off-by: Alex Cui * removing preprocessor, updating measure, adding shitelist cases Signed-off-by: Alex Cui * removing processor, modification for sp test, shitelist and word Signed-off-by: Alex Cui * updating zh date Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * realized itn being cvommented out, adding back Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * trying to run zh tn separately because it takes long time to run Signed-off-by: Alex Cui * modification to ru zh tn separately Signed-off-by: Alex Cui * independent zh tnitn tests for more time Signed-off-by: Alex Cui * adding lines to save far file Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates for reducing testing time Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * for ounct graph Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removing used graphs Signed-off-by: Alex Cui * format and removing used comments Signed-off-by: Alex Cui * removing this one, not used Signed-off-by: Alex Cui * remove unused commentss Signed-off-by: Alex Cui * removing unsed comments Signed-off-by: Alex Cui * removing unsed comments Signed-off-by: Alex Cui * removing comments Signed-off-by: Alex Cui * Delete tools/text_processing_deployment/zh directory Removing far files. 
Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * updates according to the github comments Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removing comments Signed-off-by: Alex Cui * punct grammar Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update test_cases_cardinal.txt Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update Dockerfile Copied from main branch ( which included Anand's updates) Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update launch.sh Found differences in the file. Fixing it back. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_word.py Saw word ITN being commented out. Adding it back. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update money.py Found cardinal grammar not accepting suffix. Fixed it. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update Jenkinsfile Removed duplicated zh test from line 230s Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update utils.py Addressing bug raised in bug in graph_utils.py of zh ITN and decimal tagger of ar TN #162. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update graph_utils.py Addressing bug in graph_utils.py of zh ITN and decimal tagger of ar TN #162. 
Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update measure.py Fixing code style, removing unused imports Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update word.py Fixing code style, removing unused imports Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update measure.py Removing unused import. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update post_processing.py Removing unused imports Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update post_processing.py Removing unused import Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update word.py Removing unused imports Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update cardinal.py Deleting unused graph Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update word.py Removing import pynini Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update word.py removing pynini import Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update verbalize.py removing pynutil import Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update post_processing.py removing punct graph imported Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_sparrowhawk_normalization.sh Update on test issue for Docker file locations Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_ordinal.py Fixing style. 
Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Delete nemo_text_processing/text_normalization/zh/taggers/math_symbol.py Removing because it's not one of the semiotic classes. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Delete nemo_text_processing/text_normalization/zh/verbalizers/math_symbol.py Removing because it's not one of the semiotic classes. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update Jenkinsfile Updating Jenkins date Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> --------- Signed-off-by: Jim O'Regan Signed-off-by: Jim O’Regan Signed-off-by: Alex Cui Signed-off-by: Enno Hermann Signed-off-by: Vitaly Lavrukhin Signed-off-by: ealbasiri Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Evelina Signed-off-by: Anand Joseph Signed-off-by: Linnea Pari Leaver Signed-off-by: GiacomoLeoneMaria Signed-off-by: Giacomo Cavallini Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> Signed-off-by: BuyuanCui Signed-off-by: Nikolay Karpov Signed-off-by: Nikolay Karpov Signed-off-by: Peter Plantinga Signed-off-by: Xuesong Yang <16880-xueyang@users.noreply.gitlab-master.nvidia.com> Co-authored-by: Jim O’Regan Co-authored-by: Enno Hermann Co-authored-by: Vitaly Lavrukhin Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Enas Albasiri <71229149+ealbasiri@users.noreply.github.com> Co-authored-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> Co-authored-by: lleaver <137942999+lleaver@users.noreply.github.com> Co-authored-by: Linnea Pari Leaver Co-authored-by: Jim O’Regan 
Co-authored-by: Giacomo Leone Maria Cavallini <72698188+GiacomoLeoneMaria@users.noreply.github.com> Co-authored-by: Alex Cui Co-authored-by: Anand Joseph Co-authored-by: Evelina Co-authored-by: Nikolay Karpov Co-authored-by: Nikolay Karpov Co-authored-by: Peter Plantinga Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Xuesong Yang <16880-xueyang@users.noreply.gitlab-master.nvidia.com> Signed-off-by: Alex Cui --- Jenkinsfile | 36 +- .../zh/graph_utils.py | 2 + .../inverse_text_normalization/zh/utils.py | 14 + .../zh/data/char/punctuations_zh.tsv | 2 + .../zh/data/math/symbol.tsv | 1 + .../zh/data/measure/units_en.tsv | 25 - .../zh/data/measure/units_zh.tsv | 211 ----- .../zh/data/money/currency_major.tsv | 1 - .../zh/data/number/digit_alt.tsv | 9 + .../zh/data/number/suffix.tsv | 23 + .../zh/data/number/teen.tsv | 10 + .../zh/data/number/teen_alt.tsv | 10 + .../zh/data/number/ties.tsv | 8 + .../text_normalization/zh/data/whitelist.tsv | 7 - .../text_normalization/zh/graph_utils.py | 9 +- .../text_normalization/zh/taggers/cardinal.py | 764 ++++-------------- .../text_normalization/zh/taggers/date.py | 19 +- .../text_normalization/zh/taggers/decimal.py | 28 +- .../text_normalization/zh/taggers/fraction.py | 88 +- .../zh/taggers/math_symbol.py | 45 -- .../text_normalization/zh/taggers/measure.py | 60 +- .../text_normalization/zh/taggers/money.py | 74 +- .../text_normalization/zh/taggers/ordinal.py | 6 +- .../zh/taggers/preprocessor.py | 6 +- .../zh/taggers/punctuation.py | 5 +- .../text_normalization/zh/taggers/time.py | 16 +- .../zh/taggers/tokenize_and_classify.py | 125 +-- .../zh/taggers/whitelist.py | 22 +- .../text_normalization/zh/taggers/word.py | 64 +- .../text_normalization/zh/utils.py | 2 +- .../zh/verbalizers/cardinal.py | 5 +- .../text_normalization/zh/verbalizers/date.py | 20 +- .../zh/verbalizers/decimal.py | 19 +- .../zh/verbalizers/fraction.py | 15 +- .../zh/verbalizers/math_symbol.py | 30 - 
.../zh/verbalizers/measure.py | 58 +- .../zh/verbalizers/money.py | 8 +- .../zh/verbalizers/ordinal.py | 6 +- .../zh/verbalizers/post_processing.py | 113 +++ .../zh/verbalizers/postprocessor.py | 4 +- .../text_normalization/zh/verbalizers/time.py | 10 +- .../zh/verbalizers/verbalize.py | 26 +- .../zh/verbalizers/verbalize_final.py | 7 +- .../zh/verbalizers/whitelist.py | 4 +- .../text_normalization/zh/verbalizers/word.py | 6 +- .../test_cases_cardinal.txt | 158 +--- .../test_cases_date.txt | 7 +- .../test_cases_decimal.txt | 15 +- .../test_cases_fraction.txt | 20 +- .../test_cases_math.txt | 2 - .../test_cases_measure.txt | 6 + .../test_cases_money.txt | 10 +- .../test_cases_ordinal.txt | 145 +--- .../test_cases_preprocess.txt | 1 - .../test_cases_time.txt | 8 +- .../test_cases_whitelist.txt | 10 + .../test_cases_word.txt | 7 +- tests/nemo_text_processing/zh/test_math.py | 31 - .../zh/test_preprocess.py | 31 - .../zh/test_sparrowhawk_normalization.sh | 46 +- tests/nemo_text_processing/zh/test_time.py | 16 +- .../nemo_text_processing/zh/test_whitelist.py | 73 +- tests/nemo_text_processing/zh/test_word.py | 10 + .../docker/launch.sh | 2 +- .../pynini_export.py | 7 +- 65 files changed, 1006 insertions(+), 1622 deletions(-) delete mode 100644 nemo_text_processing/text_normalization/zh/data/measure/units_zh.tsv create mode 100644 nemo_text_processing/text_normalization/zh/data/number/digit_alt.tsv create mode 100644 nemo_text_processing/text_normalization/zh/data/number/suffix.tsv create mode 100644 nemo_text_processing/text_normalization/zh/data/number/teen.tsv create mode 100644 nemo_text_processing/text_normalization/zh/data/number/teen_alt.tsv create mode 100644 nemo_text_processing/text_normalization/zh/data/number/ties.tsv delete mode 100644 nemo_text_processing/text_normalization/zh/taggers/math_symbol.py delete mode 100644 nemo_text_processing/text_normalization/zh/verbalizers/math_symbol.py create mode 100644 
nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py delete mode 100644 tests/nemo_text_processing/zh/data_text_normalization/test_cases_math.txt delete mode 100644 tests/nemo_text_processing/zh/data_text_normalization/test_cases_preprocess.txt create mode 100644 tests/nemo_text_processing/zh/data_text_normalization/test_cases_whitelist.txt delete mode 100644 tests/nemo_text_processing/zh/test_math.py delete mode 100644 tests/nemo_text_processing/zh/test_preprocess.py diff --git a/Jenkinsfile b/Jenkinsfile index 75cdaa17f..6822ee055 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-27-23-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-30-24-0' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-26-23-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' @@ -189,7 +189,7 @@ pipeline { } } - stage('L0: Create RU TN/ITN Grammars & SV & PT & ZH') { + stage('L0: Create RU TN/ITN Grammars & SV & PT') { when { anyOf { branch 'main' @@ -228,16 +228,6 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=pt --text="dez " --cache_dir ${PT_TN_CACHE}' } } - stage('L0: ZH TN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=zh --text="你" --cache_dir ${ZH_TN_CACHE}' - } - } - stage('L0: ZH ITN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=zh --text="二零零二年一月二十八日 " --cache_dir ${ZH_TN_CACHE}' - } - } } } @@ -267,9 
+257,31 @@ pipeline { } } } + stage('L0: Create ZH TN/ITN Grammar') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: ZH ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=zh --text="你" --cache_dir ${ZH_TN_CACHE}' + } + } + stage('L0: ZH TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=zh --text="6" --cache_dir ${ZH_TN_CACHE}' + } + } + } + } // L1 Tests starts here + stage('L1: TN/ITN Tests CPU') { when { anyOf { diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index 13e8ab6d0..de1a7a28c 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -22,6 +22,8 @@ from pynini.export import export from pynini.lib import byte, pynutil, utf8 +from nemo_text_processing.inverse_text_normalization.zh.utils import load_labels + NEMO_CHAR = utf8.VALID_UTF8_CHAR NEMO_DIGIT = byte.DIGIT NEMO_HEX = pynini.union(*string.hexdigits).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/utils.py b/nemo_text_processing/inverse_text_normalization/zh/utils.py index d63a1b2f7..92336fe0f 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/utils.py @@ -60,3 +60,17 @@ def get_various_formats(text: str) -> List[str]: result.append(t.upper()) result.append(t.capitalize()) return result + + +def load_labels(abs_path): + """ + loads relative path file as dictionary + + Args: + abs_path: absolute path + + Returns dictionary of mappings + """ + with open(abs_path, encoding="utf-8") as label_tsv: + labels = list(csv.reader(label_tsv, delimiter="\t")) + return labels diff --git 
a/nemo_text_processing/text_normalization/zh/data/char/punctuations_zh.tsv b/nemo_text_processing/text_normalization/zh/data/char/punctuations_zh.tsv index 963b07d12..3848d54f9 100644 --- a/nemo_text_processing/text_normalization/zh/data/char/punctuations_zh.tsv +++ b/nemo_text_processing/text_normalization/zh/data/char/punctuations_zh.tsv @@ -70,3 +70,5 @@ … ‧ ﹏ +< +> \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/zh/data/math/symbol.tsv b/nemo_text_processing/text_normalization/zh/data/math/symbol.tsv index 7f16f52a4..4eff86d48 100644 --- a/nemo_text_processing/text_normalization/zh/data/math/symbol.tsv +++ b/nemo_text_processing/text_normalization/zh/data/math/symbol.tsv @@ -5,3 +5,4 @@ × 乘 ÷ 除 ° 度 +- 减 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/zh/data/measure/units_en.tsv b/nemo_text_processing/text_normalization/zh/data/measure/units_en.tsv index 6d45b4a3b..b1a8a832a 100644 --- a/nemo_text_processing/text_normalization/zh/data/measure/units_en.tsv +++ b/nemo_text_processing/text_normalization/zh/data/measure/units_en.tsv @@ -1,7 +1,5 @@ amu 原子质量 bar 巴 -° 度 -º 度 °c 摄氏度 °C 摄氏度 ºc 摄氏度 @@ -40,23 +38,6 @@ kw 千瓦 kW 千瓦 lb 磅 lbs 磅 -m2 平方米 -m² 平方米 -m3 立方米 -m³ 立方米 -mbps 兆比特每秒 -mg 毫克 -mhz 兆赫兹 -mi2 平方英里 -mi² 平方英里 -mi 英里 -min 分钟哦 -ml 毫升 -mm2 平方毫米 -mm² 平方毫米 -mol 摩尔 -mpa 兆帕 -mph 英里每小时 ng 纳克 nm 纳米 ns 纳秒 @@ -80,13 +61,7 @@ gb 吉字节 gpa 吉帕斯卡 gy 戈瑞 ha 公顷 -m 米 -mm 毫米 -ms 毫秒 -mv 毫伏 -mw 毫瓦 pg 皮克 ps 皮秒 s 秒 -ms 毫秒 g 克 diff --git a/nemo_text_processing/text_normalization/zh/data/measure/units_zh.tsv b/nemo_text_processing/text_normalization/zh/data/measure/units_zh.tsv deleted file mode 100644 index 5ca1dd9ab..000000000 --- a/nemo_text_processing/text_normalization/zh/data/measure/units_zh.tsv +++ /dev/null @@ -1,211 +0,0 @@ -匹 -张 -座 -回 -场 -尾 -条 -个 -首 -阙 -阵 -网 -炮 -顶 -丘 -棵 -只 -支 -袭 -辆 -挑 -担 -颗 -壳 -窠 -曲 -墙 -群 -腔 -砣 -座 -客 -贯 -扎 -捆 -刀 -令 -打 -手 -罗 -坡 -山 -岭 -江 -溪 -钟 -队 -单 -双 -对 -口 -头 -脚 -板 -跳 -枝 -件 -贴 -针 -线 -管 -名 -位 -身 
-堂 -课 -本 -页 -家 -户 -层 -丝 -毫 -厘 -分 -钱 -两 -斤 -担 -铢 -石 -钧 -锱 -忽 -克 -毫 -厘 -分 -寸 -尺 -丈 -里 -寻 -常 -铺 -程 -米 -撮 -勺 -合 -升 -斗 -石 -盘 -碗 -碟 -叠 -桶 -笼 -盆 -盒 -杯 -钟 -斛 -锅 -簋 -篮 -盘 -桶 -罐 -瓶 -壶 -卮 -盏 -箩 -箱 -煲 -啖 -袋 -钵 -年 -月 -日 -季 -刻 -时 -周 -天 -秒 -分 -旬 -纪 -岁 -世 -更 -夜 -春 -夏 -秋 -冬 -代 -伏 -辈 -丸 -泡 -粒 -颗 -幢 -堆 -条 -根 -支 -道 -面 -片 -张 -颗 -块 -架 -千米 -分米 -厘米 -毫米 -微米 -纳米 -亿 -千万 -百万 -万 -千 -百 -亿块 -千万块 -百万块 -万块 -千块 -百块 -亿角 -千万角 -百万角 -万角 -千角 -百角 -亿毛 -千万毛 -百万毛 -万毛 -千毛 -百毛 -亿分 -千万分 -百万分 -万分 -千分 -百分 -亿元 -千万元 -百万元 -万元 -千元 -百元 diff --git a/nemo_text_processing/text_normalization/zh/data/money/currency_major.tsv b/nemo_text_processing/text_normalization/zh/data/money/currency_major.tsv index 88e6cc544..b80833507 100644 --- a/nemo_text_processing/text_normalization/zh/data/money/currency_major.tsv +++ b/nemo_text_processing/text_normalization/zh/data/money/currency_major.tsv @@ -168,7 +168,6 @@ Ft 匈牙利福林 ₪ 以色列谢克尔 J$ 牙买加元 лв 哈萨克斯坦腾格 -₩ 朝鲜园 лв 吉尔吉斯斯坦索姆 ₭ 老挝基普 ден 马其顿代纳尔 diff --git a/nemo_text_processing/text_normalization/zh/data/number/digit_alt.tsv b/nemo_text_processing/text_normalization/zh/data/number/digit_alt.tsv new file mode 100644 index 000000000..b949b9508 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/number/digit_alt.tsv @@ -0,0 +1,9 @@ +1 一 +2 两 +3 三 +4 四 +5 五 +6 六 +7 七 +8 八 +9 九 diff --git a/nemo_text_processing/text_normalization/zh/data/number/suffix.tsv b/nemo_text_processing/text_normalization/zh/data/number/suffix.tsv new file mode 100644 index 000000000..f44c0e151 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/number/suffix.tsv @@ -0,0 +1,23 @@ +万 +十万 +百万 +千万 +亿 +十亿 +百亿 +千亿 +萬 +十萬 +百萬 +千萬 +億 +十億 +百億 +千億 +拾萬 +佰萬 +仟萬 +拾億 +佰億 +仟億 + diff --git a/nemo_text_processing/text_normalization/zh/data/number/teen.tsv b/nemo_text_processing/text_normalization/zh/data/number/teen.tsv new file mode 100644 index 000000000..52dc01917 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/number/teen.tsv @@ -0,0 +1,10 @@ +10 十 +11 十一 +12 十二 +13 十三 +14 十四 
+15 十五 +16 十六 +17 十七 +18 十八 +19 十九 diff --git a/nemo_text_processing/text_normalization/zh/data/number/teen_alt.tsv b/nemo_text_processing/text_normalization/zh/data/number/teen_alt.tsv new file mode 100644 index 000000000..a48662621 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/number/teen_alt.tsv @@ -0,0 +1,10 @@ +10 一十 +11 一十一 +12 一十二 +13 一十三 +14 一十四 +15 一十五 +16 一十六 +17 一十七 +18 一十八 +19 一十九 diff --git a/nemo_text_processing/text_normalization/zh/data/number/ties.tsv b/nemo_text_processing/text_normalization/zh/data/number/ties.tsv new file mode 100644 index 000000000..2a73c0399 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/number/ties.tsv @@ -0,0 +1,8 @@ +2 二十 +3 三十 +4 四十 +5 五十 +6 六十 +7 七十 +8 八十 +9 九十 diff --git a/nemo_text_processing/text_normalization/zh/data/whitelist.tsv b/nemo_text_processing/text_normalization/zh/data/whitelist.tsv index 133143950..e8810f42a 100644 --- a/nemo_text_processing/text_normalization/zh/data/whitelist.tsv +++ b/nemo_text_processing/text_normalization/zh/data/whitelist.tsv @@ -77,12 +77,6 @@ C C t v CCTV kfc KFC K F C KFC Steam steam -phd 博士 -PhD 博士 -Dr. 医生 -Mr. 先生 -Mrs. 女士 -Ms. 小姐 O 2 O O to O O2O O to O P 2 P P to P @@ -161,4 +155,3 @@ cctv CCTV C C t v CCTV kfc KFC K F C KFC -Steam steam diff --git a/nemo_text_processing/text_normalization/zh/graph_utils.py b/nemo_text_processing/text_normalization/zh/graph_utils.py index 20e7532b6..f2ad527ae 100644 --- a/nemo_text_processing/text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/text_normalization/zh/graph_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -41,13 +41,18 @@ NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() - NEMO_SIGMA = pynini.closure(NEMO_CHAR) +NEMO_NOT_ALPHA = pynini.difference(NEMO_SIGMA, NEMO_ALPHA).optimize() +NEMO_SPACE_CHAR = pynini.union(NEMO_CHAR, NEMO_SPACE) delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) insert_space = pynutil.insert(" ") delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") + | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"")) +) def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]): diff --git a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py index 3756ba6c8..21437e82f 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,617 +16,177 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. - '23' -> cardinal { integer: "二十三" } - -10000 -> cardinal { negative: "负" integer: "一万" } - +10000 -> cardinal { positive: "正" integer: "一万" } + Finite state transducer for classifying cardinals + e.g. 
23 -> cardinal { integer: "二十三" } """ - def __init__(self, deterministic: bool = True, lm: bool = False): + def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="classify", deterministic=deterministic) + graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) + graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) + graph_digit_alt = pynini.string_file(get_abs_path("data/number/digit_alt.tsv")) + graph_ties = pynini.string_file(get_abs_path("data/number/ties.tsv")) + graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv")) + graph_teen_alt = pynini.string_file(get_abs_path("data/number/teen_alt.tsv")) + + alls = NEMO_DIGIT ** 2 | NEMO_DIGIT ** 1 + graph_all = ( + (graph_ties + (graph_digit | pynutil.delete('0'))) | graph_teen_alt | graph_digit + ) # graph_all when within a larger number e.g., 316-> 三百一十六 instead of 三百十六 + + graph_all = alls @ graph_all + graph_all_alt = ( + (graph_ties + (graph_digit | pynutil.delete('0'))) | graph_teen | graph_digit + ) # graph_all when at the head of the larger numbere.g., 13万 -> 十三万 instead of 一十三万 + graph_all_alt = alls @ graph_all_alt + + hundreds = NEMO_DIGIT ** 3 + graph_hundred_component = (graph_digit + pynutil.insert('百')) + pynini.union( + pynini.closure(pynutil.delete('0')), + (pynini.closure(pynutil.delete('0') + pynutil.insert('零')) + graph_all), + ) + graph_hundred = hundreds @ graph_hundred_component + + self.digit = graph_digit.optimize() + self.all = graph_all.optimize() + + thousands = NEMO_DIGIT ** 4 + graph_thousand_component = (graph_digit_alt + pynutil.insert('千')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_thousand = thousands @ graph_thousand_component + + ten_thousands = NEMO_DIGIT ** 5 + graph_ten_thousand_component = (graph_digit_alt + pynutil.insert('万')) + pynini.union( + 
pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_ten_thousand = ten_thousands @ graph_ten_thousand_component + + hundred_thousands = NEMO_DIGIT ** 6 + hundred_thousands_position = NEMO_DIGIT ** 2 + hundred_thousands_position = hundred_thousands_position @ graph_all_alt + graph_hundred_thousand_component = (hundred_thousands_position + pynutil.insert('万')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component + + millions = NEMO_DIGIT ** 7 + million_position = NEMO_DIGIT ** 3 + million_position = million_position @ graph_hundred_component + graph_million_component = (million_position + pynutil.insert('万')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_million = millions @ graph_million_component + + ten_millions = NEMO_DIGIT ** 8 + ten_million_position = NEMO_DIGIT ** 4 + ten_million_position = ten_million_position @ graph_thousand_component + graph_ten_million_component = (ten_million_position + pynutil.insert('万')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_ten_million = ten_millions @ graph_ten_million_component + + hundred_millions = NEMO_DIGIT ** 9 + graph_hundred_million_component = (graph_digit_alt + pynutil.insert('亿')) + pynini.union( + 
pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_million_component), + (pynutil.delete('00') + pynutil.insert('零') + graph_hundred_thousand_component), + (pynutil.delete('000') + pynutil.insert('零') + graph_ten_thousand_component), + (pynutil.delete('0000') + pynutil.insert('零') + graph_thousand_component), + (pynutil.delete('00000') + pynutil.insert('零') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_hundred_million = hundred_millions @ graph_hundred_million_component + + thousand_millions = NEMO_DIGIT ** 10 + thousand_millions_position = NEMO_DIGIT ** 2 + thousand_millions_position = thousand_millions_position @ graph_all_alt + graph_thousand_million_component = (thousand_millions_position + pynutil.insert('亿')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_million_component), + (pynutil.delete('00') + pynutil.insert('零') + graph_hundred_thousand_component), + (pynutil.delete('000') + pynutil.insert('零') + graph_ten_thousand_component), + (pynutil.delete('0000') + pynutil.insert('零') + graph_thousand_component), + ((pynutil.delete('00000') + pynutil.insert('零') + graph_hundred_component)), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_thousand_million = thousand_millions @ graph_thousand_million_component + + ten_billions = NEMO_DIGIT ** 11 + ten_billions_position = NEMO_DIGIT ** 3 + ten_billions_position = ten_billions_position @ graph_hundred_component + graph_ten_billions_component = (ten_billions_position + pynutil.insert('亿')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_million_component), + (pynutil.delete('00') + pynutil.insert('零') + graph_hundred_thousand_component), + (pynutil.delete('000') 
+ pynutil.insert('零') + graph_ten_thousand_component), + (pynutil.delete('0000') + pynutil.insert('零') + graph_thousand_component), + ((pynutil.delete('00000') + pynutil.insert('零') + graph_hundred_component)), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_ten_billions = ten_billions @ graph_ten_billions_component + + hundred_billions = NEMO_DIGIT ** 12 + hundred_billions_position = NEMO_DIGIT ** 4 + hundred_billions_position = hundred_billions_position @ graph_thousand_component + graph_hundred_billions_component = (hundred_billions_position + pynutil.insert('亿')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_million_component), + (pynutil.delete('00') + pynutil.insert('零') + graph_hundred_thousand_component), + (pynutil.delete('000') + pynutil.insert('零') + graph_ten_thousand_component), + (pynutil.delete('0000') + pynutil.insert('零') + graph_thousand_component), + ((pynutil.delete('00000') + pynutil.insert('零') + graph_hundred_component)), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_hundred_billions = hundred_billions @ graph_hundred_billions_component - # imports - zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) - digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) - digit_tens = pynini.string_file(get_abs_path("data/number/digit_tens.tsv")) - - # morphemes inserted + punctuation - tens_digit = pynutil.insert('十') - hundred_digit = pynutil.insert('百') - thousand_digit = pynutil.insert('千') - tenthousand_digit = pynutil.insert('万') - hundredmillion_digit = pynutil.insert('亿') - delete_punct = pynini.closure(pynutil.delete(',') | pynutil.delete(',')) - - # 十几; 10-19 - graph_teen = ( - pynini.closure(delete_punct) - + pynini.cross('1', '十') - + ( - (pynini.closure(delete_punct) + (pynini.closure(delete_punct) + digit)) - | (pynini.closure(delete_punct) + 
pynini.cross('0', '')) - ) - ) - - # 十几; 10-19 but when not alone, but within a larger number, (e.g, 119) - graph_teen_alt = ( - (pynini.closure(delete_punct) + (pynini.cross('1', '一十') + pynini.closure(delete_punct) + digit)) - | (pynini.closure(delete_punct) + pynini.cross('10', '一十')) - | (pynini.closure(delete_punct) + (pynini.cross('1,0', '一十') | pynini.cross('1,0', '一十'))) - ) # when the teen is not by itself but with in a larger number - - # 几十; 20-99 - graph_tens = ( - pynini.closure(delete_punct) - + (digit_tens + tens_digit + pynini.closure(delete_punct) + ((pynini.closure(delete_punct) + digit))) - ) | ( - digit_tens + tens_digit + (pynini.closure(delete_punct) + (pynini.cross('0', '') | pynini.cross(',0', ''))) - ) - - # 百; 100-999; hundreds - graph_hundred = ( - ( - digit - + ( - pynutil.delete('00') - | (pynutil.delete(',00') | pynutil.delete(',00')) - | (pynutil.delete('0,0') | pynutil.delete('0,0')) - ) - + hundred_digit - ) - | (digit + hundred_digit + (graph_tens | graph_teen_alt)) - | ( - digit - + hundred_digit - + ( - (pynini.cross(',0', '零') | pynini.cross(',0', '零')) - | pynini.cross('0', '零') - | (pynini.cross('0,', '零') | pynini.cross('0,', '零')) - ) - + digit - ) - ) - - # 千; 1000-9999; thousands - graph_thousand = ( - ( - digit - + ( - (pynutil.delete(',000') | pynutil.delete('000') | pynutil.delete('0,00') | pynutil.delete('00,0')) - | ( - pynutil.delete(',000') - | pynutil.delete('000') - | pynutil.delete('0,00') - | pynutil.delete('00,0') - ) - ) - + thousand_digit - ) - | (digit + pynini.closure(delete_punct) + thousand_digit + graph_hundred) - | ( - digit - + thousand_digit - + (pynini.cross('0', '零') | ((pynini.cross(',0', '零') | pynini.cross(',0', '零')))) - + (graph_tens | graph_teen_alt) - ) - | ( - digit - + pynini.closure(delete_punct) - + thousand_digit - + ( - pynini.cross('00', '零') - | (pynini.cross(',00', '零') | pynini.cross(',00', '零')) - | (pynini.cross('0,0', '零') | pynini.cross('0,0', '零')) - | (pynini.cross('00,', '零') 
| pynini.cross('00,', '零')) - ) - + digit - ) - ) - - # 万; 10000-99999; ten thousands - graph_tenthousand = ( - ( - digit - + (pynutil.delete('0000') | (pynutil.delete('0,000') | pynutil.delete('0,000'))) - + tenthousand_digit - ) - | (digit + tenthousand_digit + graph_thousand) - | ( - digit - + tenthousand_digit - + (pynini.cross('0', '零') | (pynini.cross('0,', '零') | pynini.cross('0,', '零'))) - + graph_hundred - ) - | ( - digit - + tenthousand_digit - + (pynini.cross('00', '零') | (pynini.cross('0,0', '零') | pynini.cross('0,0', '零'))) - + (graph_tens | graph_teen_alt) - ) - | ( - digit - + tenthousand_digit - + (pynini.cross('000', '零') | (pynini.cross('0,00', '零') | pynini.cross('0,00', '零'))) - + digit - ) - ) - - # 十万; 100000-999999; hundred thousands - graph_hundredthousand = ( - pynutil.add_weight( - ( - (graph_tens | graph_teen) - + tenthousand_digit - + (pynutil.delete('0000') | (pynutil.delete('0,000') | pynutil.delete('0,000'))) - ), - -0.1, - ) - | ((graph_tens | graph_teen) + tenthousand_digit + graph_thousand) - | ( - (graph_tens | graph_teen) - + tenthousand_digit - + (pynini.cross('0', '零') | (pynini.cross('0,', '零') | pynini.cross('0,', '零'))) - + graph_hundred - ) - | ( - (graph_tens | graph_teen) - + tenthousand_digit - + (pynini.cross('00', '零') | (pynini.cross('0,0', '零') | pynini.cross('0,0', '零'))) - + (graph_tens | graph_teen_alt) - ) - | ( - (graph_tens | graph_teen) - + tenthousand_digit - + (pynini.cross('000', '零') | (pynini.cross('0,00', '零') | pynini.cross('0,00', '零'))) - + digit - ) - ) - - # 百万; 1000000-9999999; millions - graph_million = ( - pynutil.add_weight( - ( - graph_hundred - + tenthousand_digit - + (pynutil.delete('0000') | (pynutil.delete('0,000') | pynutil.delete('0,000'))) - ), - -1.0, - ) - | (graph_hundred + tenthousand_digit + graph_thousand) - | ( - graph_hundred - + tenthousand_digit - + (pynini.cross('0', '零') | (pynini.cross('0,', '零') | pynini.cross('0,', '零'))) - + graph_hundred - ) - | ( - graph_hundred - + 
tenthousand_digit - + (pynini.cross('00', '零') | (pynini.cross('0,0', '零') | pynini.cross('0,0', '零'))) - + (graph_tens | graph_teen_alt) - ) - | ( - graph_hundred - + tenthousand_digit - + (pynini.cross('000', '零') | (pynini.cross('0,00', '零') | pynini.cross('0,00', '零'))) - + digit - ) - ) - - # 千万; 10000000-99999999; ten millions - graph_tenmillion = ( - pynutil.add_weight( - ( - graph_thousand - + (pynutil.delete('0000') | (pynutil.delete('0,000') | pynutil.delete('0,000'))) - + tenthousand_digit - ), - -1.0, - ) - | (graph_thousand + tenthousand_digit + graph_thousand) - | ( - graph_thousand - + tenthousand_digit - + (pynini.cross('0', '零') | (pynini.cross('0,', '零') | pynini.cross('0,', '零'))) - + graph_hundred - ) - | ( - graph_thousand - + tenthousand_digit - + (pynini.cross('00', '零') | (pynini.cross('0,0', '零') | pynini.cross('0,0', '零'))) - + (graph_tens | graph_teen_alt) - ) - | ( - graph_thousand - + tenthousand_digit - + (pynini.cross('000', '零') | (pynini.cross('0,00', '零') | pynini.cross('0,00', '零'))) - + digit - ) - ) - - # 亿; 100000000-999999999; hundred millions - graph_hundredmillion = ( - pynutil.add_weight( - ( - digit - + (pynutil.delete('00000000') | (pynutil.delete('00,000,000') | pynutil.delete('00,000,000'))) - + hundredmillion_digit - ), - -2.0, - ) - | pynutil.add_weight((digit + hundredmillion_digit + graph_tenmillion), -1.9) - | pynutil.add_weight((digit + hundredmillion_digit + pynutil.delete('0') + graph_million), -1.8) - | pynutil.add_weight( - (digit + hundredmillion_digit + pynutil.delete('00') + pynutil.insert('零') + graph_hundredthousand), - -1.7, - ) - | pynutil.add_weight( - ( - digit - + hundredmillion_digit - + (pynutil.delete('000') | (pynutil.delete('00,0') | pynutil.delete('00,0'))) - + pynutil.insert('零') - + graph_tenthousand - ), - -1.6, - ) - | pynutil.add_weight( - ( - digit - + hundredmillion_digit - + (pynutil.delete('0000') | (pynutil.delete('00,00') | pynutil.delete('00,00'))) - + pynutil.insert('零') - + 
graph_thousand - ), - -1.5, - ) - | pynutil.add_weight( - ( - digit - + hundredmillion_digit - + (pynutil.delete('00000') | (pynutil.delete('00,000,') | pynutil.delete('00,000,'))) - + pynutil.insert('零') - + graph_hundred - ), - -1.4, - ) - | pynutil.add_weight( - ( - digit - + hundredmillion_digit - + (pynutil.delete('000000') | (pynutil.delete('00,000,0') | pynutil.delete('00,000,0'))) - + pynutil.insert('零') - + (graph_tens | graph_teen_alt) - ), - -1.3, - ) - | pynutil.add_weight( - ( - digit - + hundredmillion_digit - + (pynutil.delete('0000000') | (pynutil.delete('00,000,00') | pynutil.delete('00,000,00'))) - + pynutil.insert('零') - + digit - ), - -1.2, - ) - ) - - # 十亿; 1000000000-9999999999; billions - graph_billion = ( - pynutil.add_weight( - ( - (graph_tens | graph_teen) - + (pynutil.delete('00000000') | (pynutil.delete('00,000,000') | pynutil.delete('00,000,000'))) - + hundredmillion_digit - ), - -2.0, - ) - | pynutil.add_weight(((graph_tens | graph_teen) + hundredmillion_digit + graph_tenmillion), -1.9) - | pynutil.add_weight( - ((graph_tens | graph_teen) + hundredmillion_digit + pynutil.delete('0') + graph_million), -1.8 - ) - | pynutil.add_weight( - ( - (graph_tens | graph_teen) - + hundredmillion_digit - + pynutil.delete('00') - + pynutil.insert('零') - + graph_hundredthousand - ), - -1.7, - ) - | pynutil.add_weight( - ( - (graph_tens | graph_teen) - + hundredmillion_digit - + (pynutil.delete('000') | (pynutil.delete('00,0') | pynutil.delete('00,0'))) - + pynutil.insert('零') - + graph_tenthousand - ), - -1.6, - ) - | pynutil.add_weight( - ( - (graph_tens | graph_teen) - + hundredmillion_digit - + (pynutil.delete('0000') | (pynutil.delete('00,00') | pynutil.delete('00,00'))) - + pynutil.insert('零') - + graph_thousand - ), - -1.5, - ) - | pynutil.add_weight( - ( - (graph_tens | graph_teen) - + hundredmillion_digit - + (pynutil.delete('00000') | (pynutil.delete('00,000,') | pynutil.delete('00,000,'))) - + pynutil.insert('零') - + graph_hundred - ), - 
-1.4, - ) - | pynutil.add_weight( - ( - (graph_tens | graph_teen) - + hundredmillion_digit - + (pynutil.delete('000000') | (pynutil.delete('00,000,0') | pynutil.delete('00,000,0'))) - + pynutil.insert('零') - + (graph_tens | graph_teen_alt) - ), - -1.3, - ) - | pynutil.add_weight( - ( - (graph_tens | graph_teen) - + hundredmillion_digit - + (pynutil.delete('0000000') | (pynutil.delete('00,000,00') | pynutil.delete('00,000,00'))) - + pynutil.insert('零') - + digit - ), - -1.2, - ) - ) - - # 百亿; 10000000000-99999999999; ten billions - graph_tenbillion = ( - pynutil.add_weight( - ( - graph_hundred - + (pynutil.delete('00000000') | (pynutil.delete('00,000,000') | pynutil.delete('00,000,000'))) - + hundredmillion_digit - ), - -2.0, - ) - | pynutil.add_weight((graph_hundred + hundredmillion_digit + graph_tenmillion), -1.9) - | pynutil.add_weight((graph_hundred + hundredmillion_digit + pynutil.delete('0') + graph_million), -1.8) - | pynutil.add_weight( - ( - graph_hundred - + hundredmillion_digit - + pynutil.delete('00') - + pynutil.insert('零') - + graph_hundredthousand - ), - -1.7, - ) - | pynutil.add_weight( - ( - graph_hundred - + hundredmillion_digit - + (pynutil.delete('000') | (pynutil.delete('00,0') | pynutil.delete('00,0'))) - + pynutil.insert('零') - + graph_tenthousand - ), - -1.6, - ) - | pynutil.add_weight( - ( - graph_hundred - + hundredmillion_digit - + (pynutil.delete('0000') | (pynutil.delete('00,00') | pynutil.delete('00,00'))) - + pynutil.insert('零') - + graph_thousand - ), - -1.5, - ) - | pynutil.add_weight( - ( - graph_hundred - + hundredmillion_digit - + (pynutil.delete('00000') | (pynutil.delete('00,000,') | pynutil.delete('00,000,'))) - + pynutil.insert('零') - + graph_hundred - ), - -1.4, - ) - | pynutil.add_weight( - ( - graph_hundred - + hundredmillion_digit - + (pynutil.delete('000000') | (pynutil.delete('00,000,0') | pynutil.delete('00,000,0'))) - + pynutil.insert('零') - + (graph_tens | graph_teen_alt) - ), - -1.3, - ) - | pynutil.add_weight( - ( - 
graph_hundred - + hundredmillion_digit - + (pynutil.delete('0000000') | (pynutil.delete('00,000,00') | pynutil.delete('00,000,00'))) - + pynutil.insert('零') - + digit - ), - -1.2, - ) - ) - - # 千亿; 100000000000-999999999999; hundred billions - graph_hundredbillion = ( - pynutil.add_weight( - ( - graph_thousand - + hundredmillion_digit - + (pynutil.delete('00000000') | (pynutil.delete('00,000,000') | pynutil.delete('00,000,000'))) - ), - -2.0, - ) - | pynutil.add_weight((graph_thousand + hundredmillion_digit + graph_tenmillion), -1.9) - | pynutil.add_weight((graph_thousand + hundredmillion_digit + pynutil.delete('0') + graph_million), -1.8) - | pynutil.add_weight( - ( - graph_thousand - + hundredmillion_digit - + pynutil.delete('00') - + pynutil.insert('零') - + graph_hundredthousand - ), - -1.7, - ) - | pynutil.add_weight( - ( - graph_thousand - + hundredmillion_digit - + (pynutil.delete('000') | (pynutil.delete('00,0') | pynutil.delete('00,0'))) - + pynutil.insert('零') - + graph_tenthousand - ), - -1.6, - ) - | pynutil.add_weight( - ( - graph_thousand - + hundredmillion_digit - + (pynutil.delete('0000') | (pynutil.delete('00,00') | pynutil.delete('00,00'))) - + pynutil.insert('零') - + graph_thousand - ), - -1.5, - ) - | pynutil.add_weight( - ( - graph_thousand - + hundredmillion_digit - + (pynutil.delete('00000') | (pynutil.delete('00,000,') | pynutil.delete('00,000,'))) - + pynutil.insert('零') - + graph_hundred - ), - -1.4, - ) - | pynutil.add_weight( - ( - graph_thousand - + hundredmillion_digit - + (pynutil.delete('000000') | (pynutil.delete('00,000,0') | pynutil.delete('00,000,0'))) - + pynutil.insert('零') - + (graph_tens | graph_teen_alt) - ), - -1.3, - ) - | pynutil.add_weight( - ( - graph_thousand - + hundredmillion_digit - + (pynutil.delete('0000000') | (pynutil.delete('00,000,00') | pynutil.delete('00,000,00'))) - + pynutil.insert('零') - + digit - ), - -1.2, - ) - ) - - suffix = pynini.union( - "万", - "十万", - "百万", - "千万", - "亿", - "十亿", - "百亿", - "千亿", - 
"萬", - "十萬", - "百萬", - "千萬", - "億", - "十億", - "百億", - "千億", - "拾萬", - "佰萬", - "仟萬", - "拾億", - "佰億", - "仟億", - "拾万", - "佰万", - "仟万", - "仟亿", - "佰亿", - "仟亿", - "万亿", - "萬億", - ) - graph_mandarin = pynini.closure( - ( - ( - digit - | graph_teen - | graph_tens - | graph_hundred - | graph_thousand - | graph_tenthousand - | graph_hundredthousand - ) - + suffix - ) - ) - - # combining all the graph above graph = pynini.union( - pynutil.add_weight(graph_hundredbillion, -2.0), - pynutil.add_weight(graph_tenbillion, -1.9), - pynutil.add_weight(graph_billion, -1.8), - pynutil.add_weight(graph_hundredmillion, -1.7), - pynutil.add_weight(graph_tenmillion, -1.6), - pynutil.add_weight(graph_million, -1.5), - pynutil.add_weight(graph_hundredthousand, -1.4), - pynutil.add_weight(graph_tenthousand, -1.3), - pynutil.add_weight(graph_thousand, -1.2), - pynutil.add_weight(graph_hundred, -1.1), - pynutil.add_weight(graph_tens, -1.0), - graph_teen, - digit, - zero, - ) - - # adding optional +(正)/-(负) signs - graph_sign = ( - (pynutil.insert("positive: \"") + pynini.accep("正") + pynutil.insert("\"")) - | (pynutil.insert("negative: \"") + pynini.accep("负") + pynutil.insert("\"")) - | (pynutil.insert("negative: \"") + pynini.cross("負", "负") + pynutil.insert("\"")) - | (pynutil.insert("negative: \"") + pynini.cross("-", "负") + pynutil.insert("\"")) - | (pynutil.insert("positive: \"") + pynini.cross("+", "正") + pynutil.insert("\"")) - ) - - graph_mandarin_sign = graph_sign + pynutil.insert(" ") + graph_mandarin - # final graph - final_graph_sign = ( - graph_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") - ) - final_graph_numbers_only = pynutil.insert("integer: \"") + graph + pynutil.insert("\"") - # imprted when building other grammars - self.just_cardinals = graph | graph_mandarin | final_graph_sign | graph_mandarin_sign - graph_mandarins = pynutil.insert("integer: \"") + graph_mandarin + pynutil.insert("\"") - - final_graph = 
final_graph_numbers_only | final_graph_sign | graph_mandarins | graph_mandarin_sign + graph_hundred_billions, + graph_ten_billions, + graph_thousand_million, + graph_hundred_million, + graph_ten_million, + graph_million, + graph_hundred_thousand, + graph_ten_thousand, + graph_thousand, + graph_hundred, + graph_all_alt, + graph_zero, + ) + self.just_cardinals = graph.optimize() + optional_sign = ( + pynutil.insert("negative: \"") + (pynini.accep("-") | pynini.cross("负", "-")) + pynutil.insert("\"") + ) + final_graph = ( + optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") + ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) + + self.with_sign = final_graph.optimize() final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/date.py b/nemo_text_processing/text_normalization/zh/taggers/date.py index 92fbfce4d..607b63511 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/date.py +++ b/nemo_text_processing/text_normalization/zh/taggers/date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -58,12 +58,14 @@ def __init__(self, deterministic: bool = True, lm: bool = False): ) only_month = pynutil.insert("month: \"") + month + pynutil.delete('月') + pynutil.insert("\"") only_day = pynutil.insert("day: \"") + day + delete_day + pynutil.insert("\"") + # gh_1 graph_only_date = only_year | only_month | only_day year_month = only_year + pynutil.insert(' ') + only_month month_day = only_month + pynutil.insert(' ') + only_day - graph_all = only_year + pynutil.insert(' ') + only_month + pynutil.insert(' ') + only_day - graph_combination = year_month | month_day | graph_all + graph_ymd = only_year + pynutil.insert(' ') + only_month + pynutil.insert(' ') + only_day + # gh_2 + graph_combination = year_month | month_day | graph_ymd year_component = ( pynutil.insert("year: \"") @@ -73,8 +75,9 @@ def __init__(self, deterministic: bool = True, lm: bool = False): ) month_component = pynutil.insert("month: \"") + month + delete_sign + pynutil.insert("\"") day_component = pynutil.insert("day: \"") + day + pynutil.insert("\"") + # gp_3 graph_sign = year_component + pynutil.insert(' ') + month_component + pynutil.insert(' ') + day_component - + # gp_1+2+3 graph_all = graph_only_date | graph_sign | graph_combination prefix = ( @@ -86,11 +89,13 @@ def __init__(self, deterministic: bool = True, lm: bool = False): | pynini.accep('纪元前') ) prefix_component = pynutil.insert("era: \"") + prefix + pynutil.insert("\"") - graph_prefix = prefix_component + pynutil.insert(' ') + (pynutil.add_weight(graph_all, -2.0)) + # gp_prefix+(1,2,3) + graph_prefix = prefix_component + pynutil.insert(' ') + (graph_ymd | year_month | only_year) suffix_component = pynutil.insert("era: \"") + suffix + pynutil.insert("\"") - graph_suffix = (pynutil.add_weight(graph_all, -2.0)) + pynutil.insert(' ') + suffix_component - + # gp_suffix +(1,2,3) + graph_suffix = (graph_ymd | year_month | only_year) + pynutil.insert(' ') + suffix_component + # gp_4 graph_affix = graph_prefix | graph_suffix 
graph_suffix_year = ( diff --git a/nemo_text_processing/text_normalization/zh/taggers/decimal.py b/nemo_text_processing/text_normalization/zh/taggers/decimal.py index 8228777c7..d4afb3fd9 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -73,19 +73,25 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa super().__init__(name="decimal", kind="classify", deterministic=deterministic) cardinal_before_decimal = cardinal.just_cardinals - cardinal_after_decimal = pynini.string_file(get_abs_path("data/number/digit.tsv")) | pynini.closure( - pynini.cross('0', '零') - ) + cardinal_after_decimal = pynini.string_file(get_abs_path("data/number/digit.tsv")) + zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) + + graph_integer = pynutil.insert('integer_part: \"') + cardinal_before_decimal + pynutil.insert("\"") - decimal_point = pynini.closure(pynutil.delete('.'), 0, 1) - graph_integer = pynutil.insert("integer_part: \"") + cardinal_before_decimal + pynutil.insert("\"") graph_fraction = ( - pynutil.insert("fractional_part: \"") + pynini.closure(cardinal_after_decimal, 1) + pynutil.insert("\"") + pynutil.insert("fractional_part: \"") + + pynini.closure((pynini.closure(cardinal_after_decimal, 1) | (pynini.closure(zero, 1))), 1) + + pynutil.insert("\"") ) - graph_decimal = graph_integer + decimal_point + pynutil.insert(" ") + graph_fraction + graph_decimal = graph_integer + pynutil.delete('.') + pynutil.insert(" ") + graph_fraction + self.regular_decimal = graph_decimal.optimize() graph_sign = ( - (pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"负\"")) 
+ pynutil.insert(" ")) + ( + pynini.closure(pynutil.insert("negative: \"") + pynini.cross("-", "负")) + + pynutil.insert("\"") + + pynutil.insert(" ") + ) ) | ( ( pynutil.insert('negative: ') @@ -98,14 +104,12 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa graph_with_sign = graph_sign + graph_decimal graph_regular = graph_with_sign | graph_decimal - # graph_decimal_quantity = get_quantity(graph_decimal, cardinal.just_cardinals) graph_decimal_quantity = get_quantity(graph_decimal) graph_sign_quantity = graph_sign + graph_decimal_quantity graph_quantity = graph_decimal_quantity | graph_sign_quantity - # final_graph = graph_decimal | graph_sign | graph_decimal_quantity | graph_sign_quantity final_graph = graph_regular | graph_quantity - self.decimal = final_graph + self.decimal = final_graph.optimize() final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/fraction.py b/nemo_text_processing/text_normalization/zh/taggers/fraction.py index 6d68280b5..3f9ce42c7 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/zh/taggers/fraction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst +from nemo_text_processing.text_normalization.zh.utils import get_abs_path class FractionFst(GraphFst): @@ -31,14 +32,15 @@ class FractionFst(GraphFst): 98% -> tokens { fraction { denominator: "百" numerator: "九十八"} } Args: - cardinal: CardinalFst, decimal: DecimalFst + cardinal: CardinalFst, decimal: DecimalFst """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True, lm: bool = False): + def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = False): super().__init__(name="fraction", kind="classify", deterministic=deterministic) graph_cardinals = cardinal.just_cardinals - graph_decimal = decimal.decimal + graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) + graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) slash = pynutil.delete('/') morpheme = pynutil.delete('分之') @@ -75,7 +77,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = "仟亿", ) - integer_component = pynutil.insert("integer_part: \"") + graph_cardinals + pynutil.insert("\"") + integer_component = pynutil.insert('integer_part: \"') + graph_cardinals + pynutil.insert("\"") denominator_component = pynutil.insert("denominator: \"") + graph_cardinals + pynutil.insert("\"") numerator_component = pynutil.insert("numerator: \"") + graph_cardinals + pynutil.insert("\"") @@ -86,7 +88,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = + slash + pynutil.insert(' ') + denominator_component - ) + ) # 5又1/3 + graph_only_slash = numerator_component + slash + pynutil.insert(' ') + denominator_component graph_morpheme = (denominator_component + morpheme + pynutil.insert(' ') + numerator_component) | ( @@ -97,65 +100,68 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = + morpheme + pynutil.insert(' ') + 
numerator_component - ) + ) # 5又3分之1 graph_with_suffix = ( pynini.closure(pynutil.insert("denominator: \"") + suffix + pynutil.insert("\""), 0, 1) + morpheme + pynutil.insert(' ') + numerator_component - ) + ) # 万分之1 percentage = pynutil.delete('%') - graph_percentage = ( - numerator_component - + percentage - + pynutil.insert(' ') - + pynutil.insert("denominator: \"百") + + graph_decimal = ( + pynutil.insert('integer_part: \"') + + pynini.closure( + graph_cardinals + + pynutil.delete('.') + + pynutil.insert('点') + + pynini.closure((graph_digit | graph_zero), 1) + ) + pynutil.insert("\"") ) + graph_decimal_percentage = pynini.closure( + graph_decimal + percentage + pynutil.insert(' denominator: \"百"'), 1 + ) # 5.6% - graph_hundred = pynutil.delete('100%') + pynutil.insert('numerator: \"百\" denominator: \"百"') + graph_integer_percentage = pynini.closure( + (numerator_component) + percentage + pynutil.insert(' denominator: \"百"'), 1 + ) # 5% - graph_optional_sign = ( - (pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"负\""))) - | (pynini.closure(pynutil.insert("positive: ") + pynini.cross("+", "\"正\""))) - | (pynutil.insert("positive: ") + pynutil.insert("\"") + pynini.accep('正') + pynutil.insert("\"")) - | ( - pynutil.insert('negative: ') - + pynutil.insert("\"") - + (pynini.accep('负') | pynini.cross('負', '负')) - + pynutil.insert("\"") - ) - ) + graph_hundred = pynutil.delete('100%') + pynutil.insert('numerator: \"百\" denominator: \"百"') + # 100% - graph_decimals = ( - graph_decimal - + pynutil.insert(" ") - + percentage - + pynutil.insert("denominator: \"百") + graph_optional_sign = (pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"负\""))) | ( + pynutil.insert('negative: ') + + pynutil.insert("\"") + + (pynini.accep('负') | pynini.cross('負', '负')) + pynutil.insert("\"") ) - graph = ( - graph_with_integer - | graph_only_slash - | graph_morpheme - | graph_with_suffix - | graph_percentage - | graph_decimals - | 
pynutil.add_weight(graph_hundred, -3.0) + graph = pynini.union( + graph_with_integer, + graph_only_slash, + graph_morpheme, + graph_with_suffix, + graph_decimal_percentage, + graph_integer_percentage, + graph_hundred, ) graph_with_sign = ( (graph_optional_sign + pynutil.insert(" ") + graph_with_integer) | (graph_optional_sign + pynutil.insert(" ") + graph_only_slash) | (graph_optional_sign + pynutil.insert(" ") + graph_morpheme) | (graph_optional_sign + pynutil.insert(" ") + graph_with_suffix) - | (graph_optional_sign + pynutil.insert(" ") + graph_percentage) - | pynutil.add_weight((graph_optional_sign + pynutil.insert(" ") + graph_hundred), -3.0) + | (graph_optional_sign + pynutil.insert(" ") + graph_integer_percentage) + | (graph_optional_sign + pynutil.insert(" ") + graph_decimal_percentage) + | (graph_optional_sign + pynutil.insert(" ") + graph_hundred) ) - final_graph = graph | graph_with_sign + final_graph = graph | pynutil.add_weight(graph_with_sign, -3.0) + + self.just_fractions = graph.optimize() + self.fractions = final_graph.optimize() final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/math_symbol.py b/nemo_text_processing/text_normalization/zh/taggers/math_symbol.py deleted file mode 100644 index d6ae0be9c..000000000 --- a/nemo_text_processing/text_normalization/zh/taggers/math_symbol.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import pynini -from pynini.lib import pynutil - -from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst -from nemo_text_processing.text_normalization.zh.taggers.cardinal import CardinalFst -from nemo_text_processing.text_normalization.zh.utils import get_abs_path - - -class MathSymbol(GraphFst): - ''' - + -> tokens { sign: "加" } - ''' - - def __init__(self, deterministic: bool = True, lm: bool = False): - super().__init__(name="sign", kind="classify", deterministic=deterministic) - ''' - add your sign in data/math/symbol.tsv,this graph just convert sigh to character,you can add more - cases with detailed cases - ''' - score_sign = pynini.string_file(get_abs_path("data/math/score.tsv")) | pynini.string_file( - get_abs_path("data/math/symbol.tsv") - ) - score = ( - pynutil.insert("score: \"") - + pynini.closure(score_sign, 0, 1) - + CardinalFst().just_cardinals - + score_sign - + CardinalFst().just_cardinals - + pynutil.insert("\"") - ) - graph = score - self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/measure.py b/nemo_text_processing/text_normalization/zh/taggers/measure.py index 3fa61cffe..d7da8f524 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/measure.py +++ b/nemo_text_processing/text_normalization/zh/taggers/measure.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ + import pynini from pynini.lib import pynutil @@ -18,33 +20,57 @@ from nemo_text_processing.text_normalization.zh.utils import get_abs_path -class Measure(GraphFst): +class MeasureFst(GraphFst): ''' 1kg -> tokens { measure { cardinal { integer: "一" } units: "千克" } } ''' - def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True, lm: bool = False): + def __init__( + self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True, lm: bool = False + ): super().__init__(name="measure", kind="classify", deterministic=deterministic) units_en = pynini.string_file(get_abs_path("data/measure/units_en.tsv")) - units_zh = pynini.string_file(get_abs_path("data/measure/units_zh.tsv")) - graph_cardinal = cardinal.just_cardinals - integer_component = pynutil.insert("integer: \"") + graph_cardinal + pynutil.insert("\"") - unit_component = pynutil.insert("units: \"") + (units_en | units_zh) + pynutil.insert("\"") - graph_cardinal_measure = integer_component + insert_space + unit_component + graph_cardinal = cardinal.with_sign + graph_decimal = decimal.decimal + + # these units are added due to failures when running Sparrow Hawk tests where "ms" would be processed as "m" and "s" left outside of the tagger + units = ( + pynini.cross("ms", "毫秒") + | pynini.cross("m²", "平方米") + | pynini.cross("m2", "平方米") + | pynini.cross("m²", "平方米") + | pynini.cross("m³", "立方米") + | pynini.cross("mbps", "兆比特每秒") + | pynini.cross("mg", "毫克") + | pynini.cross("mhz", "兆赫兹") + | pynini.cross("mi2", "平方英里") + | pynini.cross("mi²", "平方英里") + | pynini.cross("mi", "英里") + | pynini.cross("min", "分钟") + | pynini.cross("ml", "毫升") + | pynini.cross("mm2", "平方毫米") + | pynini.cross("mm²", "平方毫米") + | pynini.cross("mol", "摩尔") + | pynini.cross("mpa", "兆帕") + | pynini.cross("mph", "英里每小时") + | pynini.cross("mm", "毫米") + | pynini.cross("mv", "毫伏") + | pynini.cross("mw", "毫瓦") + ) + + unit_component = pynutil.insert("units: \"") + (units_en | units) + 
pynutil.insert("\"") - decimal = decimal.decimal - graph_decimal = ( - decimal + insert_space + pynutil.insert("units: \"") + (units_en | units_zh) + pynutil.insert("\"") + graph_cardinal_measure = pynini.closure( + (pynutil.insert("cardinal { ") + graph_cardinal + pynutil.insert(" } ") + insert_space + unit_component), 1 ) - graph_sign = ( - (pynutil.insert("negative: \"") + pynini.accep("负") + pynutil.insert("\"")) - | (pynutil.insert("negative: \"") + pynini.cross("負", "负") + pynutil.insert("\"")) - | (pynutil.insert("negative: \"") + pynini.cross("-", "负") + pynutil.insert("\"")) + graph_decimal_measure = pynini.closure( + (pynutil.insert("decimal { ") + graph_decimal + pynutil.insert(" } ") + unit_component), 1 ) - graph = pynini.closure(graph_sign + insert_space) + (graph_cardinal_measure | graph_decimal) + graph_measures = graph_decimal_measure | graph_cardinal_measure - self.fst = self.add_tokens(graph).optimize() + final_graph = self.add_tokens(graph_measures) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 93fa59e61..786319627 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -19,6 +19,40 @@ from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path +# def get_quantity(decimal): +suffix = pynini.union( + "万", + "十万", + "百万", + "千万", + "亿", + "十亿", + "百亿", + "千亿", + "萬", + "十萬", + "百萬", + "千萬", + "億", + "十億", + "百億", + "千億", + "拾萬", + "佰萬", + "仟萬", + "拾億", + "佰億", + "仟億", + "拾万", + "佰万", + "仟万", + "仟亿", + "佰亿", + "仟亿", + "万亿", + "萬億", +) + class MoneyFst(GraphFst): """ @@ -27,18 +61,19 @@ class MoneyFst(GraphFst): '23美元' -> money { integer: "二十三" currency: "美元" } """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True, lm: bool = False): + def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = False): super().__init__(name="money", kind="classify", deterministic=deterministic) cardinal = cardinal.just_cardinals - decimal = decimal.decimal currency = pynini.string_file(get_abs_path("data/money/currency_major.tsv")) currency_mandarin = pynini.string_file(get_abs_path("data/money/currency_mandarin.tsv")) + graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) + graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) # regular money gramamr with currency symbols $1000 currency_component = pynutil.insert("currency: \"") + currency + pynutil.insert("\"") - number_component = pynutil.insert("integer: \"") + cardinal + pynutil.insert("\"") + number_component = pynutil.insert("integer_part: \"") + (cardinal | (cardinal + suffix)) + pynutil.insert("\"") graph_regular_money = currency_component + pynutil.insert(" ") + number_component # 块 元 毛 with optional symbols @@ -54,8 +89,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = currency_mandarin_component = pynutil.insert("currency: \"") + currency_mandarin + pynutil.insert("\"") unit_components = ( (pynutil.insert("currency: \"") + unit_major + pynutil.insert("\"")) - | 
(pynutil.insert("currency_major: \"") + unit_minor + pynutil.insert("\"")) - | (pynutil.insert("currency_minor: \"") + unit_minor_alt + pynutil.insert("\"")) + | (pynutil.insert("currency_maj: \"") + unit_minor + pynutil.insert("\"")) + | (pynutil.insert("currency_min: \"") + unit_minor_alt + pynutil.insert("\"")) ) graph_unit_only = ( @@ -70,12 +105,33 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = graph_mandarin_money = number_component + pynutil.insert(" ") + currency_mandarin_component # larger money as decimals - graph_decimal_money = (decimal + pynutil.insert(" ") + currency_mandarin_component) | ( - currency_component + pynutil.insert(" ") + decimal + graph_decimal = ( + pynutil.insert('integer_part: \"') + + pynini.closure( + pynini.closure(cardinal, 1) + + pynutil.delete('.') + + pynutil.insert('点') + + pynini.closure((graph_digit | graph_zero), 1) + ) + + pynutil.insert("\"") + ) + graph_decimal_money = ( + pynini.closure(graph_decimal, 1) + + pynini.closure(pynutil.insert(' quantity: \"') + suffix + pynutil.insert('\"')) + + pynutil.insert(" ") + + pynini.closure(currency_mandarin_component, 1) + ) | ( + pynini.closure(currency_component, 1) + + pynutil.insert(" ") + + pynini.closure(graph_decimal, 1) + + pynini.closure(pynutil.insert(" ") + pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')) ) graph = ( - graph_regular_money | graph_units | pynutil.add_weight(graph_mandarin_money, -3.0) | graph_decimal_money + graph_regular_money + | graph_units + | pynutil.add_weight(graph_mandarin_money, -3.0) + | pynutil.add_weight(graph_decimal_money, -1.0) ) final_graph = graph diff --git a/nemo_text_processing/text_normalization/zh/taggers/ordinal.py b/nemo_text_processing/text_normalization/zh/taggers/ordinal.py index 258a9068c..e09dd8047 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/ordinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 
2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,8 +28,8 @@ class OrdinalFst(GraphFst): cardinal: CardinalFst """ - def __init__(self, cardinal: GraphFst): - super().__init__(name="ordinal", kind="verbalize") + def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = False): + super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic) graph_cardinal = cardinal.just_cardinals morpheme = pynini.accep('第') diff --git a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py index df612fd8d..82e1c174f 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py +++ b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + + import pynini from pynini.lib import pynutil @@ -18,7 +20,7 @@ from nemo_text_processing.text_normalization.zh.utils import get_abs_path -class PreProcessor(GraphFst): +class PreProcessorFst(GraphFst): ''' Preprocessing of TN: 1. 
interjections removal such as '啊, 呃' diff --git a/nemo_text_processing/text_normalization/zh/taggers/punctuation.py b/nemo_text_processing/text_normalization/zh/taggers/punctuation.py index cff124834..d6920c75d 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/zh/taggers/punctuation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + import sys from unicodedata import category @@ -36,7 +37,7 @@ class PunctuationFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="punctuation", kind="classify", deterministic=deterministic) - s = "!#%&\'()*+,-./:;<=>?@^_`{|}~\"。,;-《》“”" + s = "!#%&\'()*+,-/:;<=>?@^_`{|}~\"。,;-《》“”" punct_symbols_to_exclude = ["[", "]"] punct_unicode = [ diff --git a/nemo_text_processing/text_normalization/zh/taggers/time.py b/nemo_text_processing/text_normalization/zh/taggers/time.py index 283b8c47b..b0248d5c3 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/time.py +++ b/nemo_text_processing/text_normalization/zh/taggers/time.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -41,9 +41,9 @@ def __init__(self, deterministic: bool = True, lm: bool = False): # gramamr for time, separated by colons 05:03:13 symbol = pynutil.delete(":") | pynutil.delete(":") - hour_component = pynutil.insert("hour: \"") + hour + pynutil.insert('点') + pynutil.insert("\"") - minute_component = pynutil.insert("minute: \"") + minute + pynutil.insert('分') + pynutil.insert("\"") - second_component = pynutil.insert("second: \"") + second + pynutil.insert('秒') + pynutil.insert("\"") + hour_component = pynutil.insert("hours: \"") + hour + pynutil.insert('点') + pynutil.insert("\"") + minute_component = pynutil.insert("minutes: \"") + minute + pynutil.insert('分') + pynutil.insert("\"") + second_component = pynutil.insert("seconds: \"") + second + pynutil.insert('秒') + pynutil.insert("\"") # combining 3 components hour_minute_second = ( hour_component @@ -75,12 +75,12 @@ def __init__(self, deterministic: bool = True, lm: bool = False): minute_duration = pynini.accep("分钟") | pynini.accep('刻') | pynini.accep('刻钟') second_duration = pynini.accep("秒钟") | pynini.cross('秒鐘', '秒钟') | pynini.accep('秒') # combining two above - hour_component = pynutil.insert("hour: \"") + hour + (hour_clock | hour_duration) + pynutil.insert("\"") + hour_component = pynutil.insert("hours: \"") + hour + (hour_clock | hour_duration) + pynutil.insert("\"") minute_component = ( - pynutil.insert("minute: \"") + minute + (minute_clock | minute_duration) + pynutil.insert("\"") + pynutil.insert("minutes: \"") + minute + (minute_clock | minute_duration) + pynutil.insert("\"") ) second_component = ( - pynutil.insert("second: \"") + second + (second_clock | second_duration) + pynutil.insert("\"") + pynutil.insert("seconds: \"") + second + (second_clock | second_duration) + pynutil.insert("\"") ) hour_minute = hour_component + pynutil.insert(' ') + minute_component hour_second = hour_component + pynutil.insert(' ') + second_component @@ -97,7 +97,7 @@ def __init__(self, deterministic: bool = True, lm: 
bool = False): ) # gramamr for time, back count; 五点差n分n秒 - backcount = pynutil.insert("verb: \"") + pynini.accep('差') + pynutil.insert("\"") + backcount = pynutil.insert("morphosyntactic_features: \"") + pynini.accep('差') + pynutil.insert("\"") graph_hour = ( ( pynini.closure(backcount) diff --git a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py index 822f3d00f..d35ea178b 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,41 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ import os import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.zh.graph_utils import ( - NEMO_CHAR, - NEMO_DIGIT, - GraphFst, - delete_extra_space, - delete_space, - generator_main, -) +from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst, generator_main from nemo_text_processing.text_normalization.zh.taggers.cardinal import CardinalFst from nemo_text_processing.text_normalization.zh.taggers.date import DateFst from nemo_text_processing.text_normalization.zh.taggers.decimal import DecimalFst from nemo_text_processing.text_normalization.zh.taggers.fraction import FractionFst -from nemo_text_processing.text_normalization.zh.taggers.math_symbol import MathSymbol -from nemo_text_processing.text_normalization.zh.taggers.measure import Measure +from nemo_text_processing.text_normalization.zh.taggers.measure import MeasureFst from nemo_text_processing.text_normalization.zh.taggers.money import MoneyFst from nemo_text_processing.text_normalization.zh.taggers.ordinal import OrdinalFst -from nemo_text_processing.text_normalization.zh.taggers.preprocessor import PreProcessor from nemo_text_processing.text_normalization.zh.taggers.punctuation import PunctuationFst from nemo_text_processing.text_normalization.zh.taggers.time import TimeFst from nemo_text_processing.text_normalization.zh.taggers.whitelist import WhiteListFst -from nemo_text_processing.text_normalization.zh.taggers.word import Char -from nemo_text_processing.utils.logging import logger +from nemo_text_processing.text_normalization.zh.taggers.word import WordFst class ClassifyFst(GraphFst): """ - Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased. - For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. 
+ For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. deterministic: if True will provide a single transduction option, @@ -59,7 +50,7 @@ class ClassifyFst(GraphFst): def __init__( self, input_case: str, - deterministic: bool = False, + deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False, whitelist: str = None, @@ -70,84 +61,40 @@ def __init__( if cache_dir is not None and cache_dir != "None": os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" - far_file = os.path.join( - # cache_dir, f"_{input_case}_zh_tn_{deterministic}_deterministic{whitelist_file}.far" - cache_dir, - f"_{input_case}_zh_tn_{deterministic}_deterministic_{whitelist_file}.far", - ) + far_file = os.path.join(cache_dir, f"zh_tn_{deterministic}_deterministic_{whitelist_file}_tokenize.far") if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] - no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT)) - self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize() - logger.info(f"ClassifyFst.fst was restored from {far_file}.") else: - logger.info(f"Creating ClassifyFst grammars. 
This might take some time...") - - cardinal = CardinalFst() - cardinal_graph = cardinal.fst - - ordinal = OrdinalFst(cardinal=cardinal) - ordinal_graph = ordinal.fst - - decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic) - decimal_graph = decimal.fst - - fraction = FractionFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic) - fraction_graph = fraction.fst - + cardinal = CardinalFst(deterministic=deterministic) date = DateFst(deterministic=deterministic) - date_graph = date.fst - - word_graph = Char(deterministic=deterministic).fst - - self.time = TimeFst(deterministic=deterministic) - time_graph = self.time.fst - - money = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic) - money_graph = money.fst - - self.math = MathSymbol(deterministic=deterministic) - math_graph = self.math.fst - - self.measure = Measure(cardinal=cardinal, decimal=decimal, deterministic=deterministic) - measure_graph = self.measure.fst - - self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) - whitelist_graph = self.whitelist.fst - punct_graph = PunctuationFst(deterministic=deterministic).fst - - classify = ( - pynutil.add_weight(whitelist_graph, 1.001) - | pynutil.add_weight(cardinal_graph, -2.0) - | pynutil.add_weight(time_graph, 1.1) - | pynutil.add_weight(fraction_graph, -1.1) - | pynutil.add_weight(date_graph, -1.0) - | pynutil.add_weight(ordinal_graph, 1.1) - | pynutil.add_weight(decimal_graph, -1.0) - | pynutil.add_weight(money_graph, -1.1) - | pynutil.add_weight(math_graph, 1.1) - | pynutil.add_weight(measure_graph, -1.1) - | pynutil.add_weight(word_graph, 1.1) - ) - - classify |= pynutil.add_weight(word_graph, 100) - - punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") - token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") - token_plus_punct = ( - pynini.closure(punct + pynutil.insert(" ")) + token + 
pynini.closure(pynutil.insert(" ") + punct) + decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic) + time = TimeFst(deterministic=deterministic) + fraction = FractionFst(cardinal=cardinal, deterministic=deterministic) + money = MoneyFst(cardinal=cardinal, deterministic=deterministic) + measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic) + ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic) + whitelist = WhiteListFst(deterministic=deterministic) + word = WordFst(deterministic=deterministic) + punctuation = PunctuationFst(deterministic=deterministic) + + classify = pynini.union( + pynutil.add_weight(date.fst, 1.1), + pynutil.add_weight(fraction.fst, 1.0), + pynutil.add_weight(money.fst, 1.1), + pynutil.add_weight(measure.fst, 1.05), + pynutil.add_weight(time.fst, 1.1), + pynutil.add_weight(whitelist.fst, 1.1), + pynutil.add_weight(cardinal.fst, 1.1), + pynutil.add_weight(decimal.fst, 3.05), + pynutil.add_weight(ordinal.fst, 1.1), + pynutil.add_weight(punctuation.fst, 1.0), + pynutil.add_weight(word.fst, 100), ) - graph = token_plus_punct + pynini.closure(pynutil.add_weight(delete_extra_space, 1.1) + token_plus_punct) - graph = delete_space + graph + delete_space - - # self.fst = graph.optimize() - tagger = graph.optimize() - preprocessor = PreProcessor(remove_interjections=True, fullwidth_to_halfwidth=True,) - self.fst = preprocessor.fst @ tagger + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") + tagger = pynini.closure(token, 1) - no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT)) - self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize() + self.fst = tagger if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) diff --git a/nemo_text_processing/text_normalization/zh/taggers/whitelist.py b/nemo_text_processing/text_normalization/zh/taggers/whitelist.py index 5b6196102..9015bd047 100644 --- 
a/nemo_text_processing/text_normalization/zh/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/zh/taggers/whitelist.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -33,33 +33,25 @@ class WhiteListFst(GraphFst): input_file: path to a file with whitelist replacements """ - def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None): + def __init__(self, deterministic: bool = True, input_file: str = None): super().__init__(name="whitelist", kind="classify", deterministic=deterministic) - def _get_whitelist_graph(input_case, file): + def _get_whitelist_graph(file): whitelist = load_labels(file) - if input_case == "lower_cased": - whitelist = [[x[0].lower()] + x[1:] for x in whitelist] graph = pynini.string_map(whitelist) return graph - graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist.tsv")) - if not deterministic and input_case != "lower_cased": - graph |= pynutil.add_weight( - _get_whitelist_graph("lower_cased", get_abs_path("data/whitelist.tsv")), weight=0.0001 - ) + graph = _get_whitelist_graph(get_abs_path("data/whitelist.tsv")) + + graph |= pynutil.add_weight(_get_whitelist_graph(get_abs_path("data/whitelist.tsv")), weight=0.0001) if input_file: - whitelist_provided = _get_whitelist_graph(input_case, input_file) + whitelist_provided = _get_whitelist_graph(input_file) if not deterministic: graph |= whitelist_provided else: graph = whitelist_provided - if not deterministic: - units_graph = _get_whitelist_graph(input_case, file=get_abs_path("data/measure/measurements.tsv")) - graph |= units_graph - self.graph = graph self.final_graph = convert_space(self.graph).optimize() self.fst = (pynutil.insert("name: \"") + self.final_graph + 
pynutil.insert("\"")).optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/word.py b/nemo_text_processing/text_normalization/zh/taggers/word.py index 776e4afdc..4e3b42b00 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/word.py +++ b/nemo_text_processing/text_normalization/zh/taggers/word.py @@ -1,30 +1,34 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pynini -from pynini.lib import pynutil - -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_SPACE, GraphFst - - -class Char(GraphFst): - ''' - 你 -> char { name: "你" } - ''' - - def __init__(self, deterministic: bool = True, lm: bool = False): - super().__init__(name="char", kind="classify", deterministic=deterministic) - - graph = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") - self.fst = graph.optimize() +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst + + +class WordFst(GraphFst): + """ + Finite state transducer for classifying word. + e.g. dormir -> tokens { name: "dormir" } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="word", kind="classify") + word = pynutil.insert("name: \"") + NEMO_NOT_QUOTE + pynutil.insert("\"") + self.fst = word.optimize() diff --git a/nemo_text_processing/text_normalization/zh/utils.py b/nemo_text_processing/text_normalization/zh/utils.py index d2748380e..4d08f1deb 100644 --- a/nemo_text_processing/text_normalization/zh/utils.py +++ b/nemo_text_processing/text_normalization/zh/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/zh/verbalizers/cardinal.py index 0cd9c3193..1a28241af 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -30,7 +30,7 @@ class CardinalFst(GraphFst): def __init__(self, deterministic: bool = True, lm: bool = False): super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic) - delete_sign = pynini.cross("negative: \"负\"", "负") | pynini.cross("positive: \"正\"", "正") + delete_sign = pynini.cross("negative: \"-\"", "负") delete_integer = ( pynutil.delete("integer: ") + pynutil.delete("\"") @@ -44,6 +44,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False): ) graph_sign = delete_sign + delete_space + delete_integer final_graph = delete_integer | graph_sign | graph_mandarin + self.numbers = final_graph delete_tokens = self.delete_tokens(final_graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/date.py b/nemo_text_processing/text_normalization/zh/verbalizers/date.py index 86405bcff..f69f4a797 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ import pynini from pynini.lib import pynutil @@ -55,6 +56,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False): optional_era = ( pynutil.delete("era: ") + pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") ) + graph_date = ( pynini.closure(year_component) + pynini.closure(delete_space) @@ -62,7 +64,20 @@ def __init__(self, deterministic: bool = True, lm: bool = False): + pynini.closure(delete_space) + pynini.closure(day_component) ) - graph_date_era = optional_era + delete_space + graph_date + + graph_date_era = pynini.union( + (optional_era + delete_space + year_component), + (optional_era + delete_space + year_component + delete_space + month_component), + ( + optional_era + + delete_space + + year_component + + delete_space + + month_component + + delete_space + + day_component + ), + ) graph_date_all = graph_date | graph_date_era @@ -84,6 +99,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False): ) final_graph = graph_date_all | graph_range + # final_graph = optional_era + delete_space + year_component delete_tokens = self.delete_tokens(final_graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/text_normalization/zh/verbalizers/decimal.py index 05fb2045e..795ab01a6 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -30,21 +30,9 @@ class DecimalFst(GraphFst): def __init__(self, deterministic: bool = True, lm: bool = False): super().__init__(name="decimal", kind="verbalize", deterministic=deterministic) - integer = ( - pynutil.delete("integer_part:") - + delete_space - + pynutil.delete("\"") - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") - ) + integer = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - fractional = ( - pynutil.delete("fractional_part:") - + delete_space - + pynutil.delete("\"") - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") - ) + fractional = pynutil.delete("fractional_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") quantity = ( pynutil.delete("quantity:") @@ -63,6 +51,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False): ) graph = integer + delete_space + pynutil.insert("点") + fractional + self.decimal_regular = graph graph_quantity = graph + delete_space + quantity graph_regular = graph | graph_quantity diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/fraction.py b/nemo_text_processing/text_normalization/zh/verbalizers/fraction.py index 8207c1a22..c2a719c16 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/fraction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -72,10 +72,19 @@ def __init__(self, decimal: GraphFst, deterministic: bool = True, lm: bool = Fal ) graph_no_integer = denominator_part + delete_space + pynutil.insert('分之') + numerator_part graph = graph_with_integer | graph_no_integer - graph_with_sign = sign_part + delete_space + graph - graph_with_decimal = denominator_part + delete_space + pynutil.insert('分之') + graph_decimal + + graph_with_decimal = ( + denominator_part + + delete_space + + pynutil.insert('分之') + + pynutil.delete("integer_part: \"") + + pynini.closure(NEMO_NOT_QUOTE) + + pynutil.delete("\"") + ) + graph_with_sign = sign_part + delete_space + (graph | graph_with_decimal) final_graph = graph_with_sign | graph | graph_with_decimal + self.fraction = final_graph delete_tokens = self.delete_tokens(final_graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/math_symbol.py b/nemo_text_processing/text_normalization/zh/verbalizers/math_symbol.py deleted file mode 100644 index 59ef1c31a..000000000 --- a/nemo_text_processing/text_normalization/zh/verbalizers/math_symbol.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import pynini -from pynini.lib import pynutil - -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst - - -class MathSymbol(GraphFst): - ''' - tokens { sign: "加" } -> 加 - ''' - - def __init__(self, deterministic: bool = True, lm: bool = False): - super().__init__(name="sign", kind="verbalize", deterministic=deterministic) - - graph = pynutil.delete('score: \"') + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete('\"') - - self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/measure.py b/nemo_text_processing/text_normalization/zh/verbalizers/measure.py index ff4d0df07..00ba3b8ed 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/measure.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,41 +11,61 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ + import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space -class Measure(GraphFst): +class MeasureFst(GraphFst): ''' tokens { measure { cardinal: "一" } units: "千克" } } -> 一千克 ''' - def __init__(self, deterministic: bool = True, lm: bool = False): + def __init__( + self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True, lm: bool = False + ): super().__init__(name="measure", kind="verbalize", deterministic=deterministic) + cardinal = cardinal.numbers + decimal = decimal.decimal_component sign_component = pynutil.delete("negative: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") - integer_component = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") unit_component = pynutil.delete("units: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") - cardinal_graph = integer_component + delete_space + unit_component + graph_cardinal = ( + pynutil.delete("cardinal { ") + cardinal + pynutil.delete(" } ") + delete_space + unit_component + ) - decimal_graph = ( - pynutil.delete("integer_part: \"") - + pynini.closure(NEMO_NOT_QUOTE) - + pynutil.delete("\"") - + pynutil.insert("点") + graph_decimal = ( + pynutil.delete("decimal {") + + pynini.closure(pynutil.delete(NEMO_SPACE)) + + decimal + + pynini.closure(pynutil.delete(NEMO_SPACE)) + + pynutil.delete("}") + + pynini.closure(pynutil.delete(NEMO_SPACE)) + delete_space - + pynutil.delete("fractional_part: \"") - + pynini.closure(NEMO_NOT_QUOTE, 0) - + pynutil.delete("\"") + + unit_component + ) + + graph_fraction = ( + pynutil.delete("fraction {") + + pynini.closure(pynutil.delete(NEMO_SPACE)) + + fraction.fraction + + pynini.closure(pynutil.delete(NEMO_SPACE)) + + pynutil.delete("}") + + 
pynini.closure(pynutil.delete(NEMO_SPACE)) + delete_space - + pynutil.delete("units: \"") - + pynini.closure(NEMO_NOT_QUOTE) - + pynutil.delete("\"") + + unit_component ) - graph = pynini.closure(sign_component + delete_space) + (cardinal_graph | decimal_graph) + graph_math_cardinal = pynutil.delete("cardinal { ") + cardinal + pynutil.delete(" } ") + + graph_measures = graph_decimal | graph_cardinal | graph_fraction + graph_maths = graph_math_cardinal + + final_graph = graph_maths | graph_measures - self.fst = self.delete_tokens(graph).optimize() + delete_tokens = self.delete_tokens(final_graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/money.py b/nemo_text_processing/text_normalization/zh/verbalizers/money.py index 9e121bbc6..74f517d01 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/money.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -31,13 +31,13 @@ def __init__(self, decimal: GraphFst, deterministic: bool = True, lm: bool = Fal super().__init__(name="money", kind="verbalize", deterministic=deterministic) # components to combine to make graphs - number_component = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + number_component = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") currency_component = pynutil.delete("currency: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") decimal_component = decimal.decimal_component unit_only_component = ( (pynutil.delete("currency: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")) - | (pynutil.delete("currency_major: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")) - | (pynutil.delete("currency_minor: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")) + | (pynutil.delete("currency_maj: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")) + | (pynutil.delete("currency_min: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")) ) # graphs diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/zh/verbalizers/ordinal.py index 0379c06fe..d019355e2 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/ordinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -25,8 +25,8 @@ class OrdinalFst(GraphFst): tokens { ordinal { integer: "第一千万" } } -> 第一千万 """ - def __init__(self): - super().__init__(name="ordinal", kind="verbalize") + def __init__(self, deterministic: bool = True, lm: bool = False): + super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic) symbol = pynini.union("-", "~", "——", "—") dash = pynini.cross(symbol, "到") diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py new file mode 100644 index 000000000..4bafef0bd --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py @@ -0,0 +1,113 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os + +import pynini + +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_NOT_SPACE, + NEMO_SIGMA, + delete_space, + generator_main, +) +from nemo_text_processing.utils.logging import logger + + +class PostProcessingFst: + """ + Finite state transducer that post-processing an entire sentence after verbalization is complete, e.g. + removes extra spaces around punctuation marks " ( one hundred and twenty three ) " -> "(one hundred and twenty three)" + + Args: + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. 
+ overwrite_cache: set to True to overwrite .far files + """ + + def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, "zh_tn_post_processing.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["post_process_graph"] + logger.info(f'Post processing graph was restored from {far_file}.') + else: + self.set_punct_dict() + self.fst = self.get_punct_postprocess_graph() + + if far_file: + generator_main(far_file, {"post_process_graph": self.fst}) + + def set_punct_dict(self): + self.punct_marks = { + "'": [ + "'", + '´', + 'ʹ', + 'ʻ', + 'ʼ', + 'ʽ', + 'ʾ', + 'ˈ', + 'ˊ', + 'ˋ', + '˴', + 'ʹ', + '΄', + '՚', + '՝', + 'י', + '׳', + 'ߴ', + 'ߵ', + 'ᑊ', + 'ᛌ', + '᾽', + '᾿', + '`', + '´', + '῾', + '‘', + '’', + '‛', + '′', + '‵', + 'ꞌ', + ''', + '`', + '𖽑', + '𖽒', + ], + } + + def get_punct_postprocess_graph(self): + """ + Returns graph to post process punctuation marks. + + {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. + By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. 
+ """ + + remove_space_around_single_quote = pynini.cdrewrite( + delete_space, NEMO_NOT_SPACE, NEMO_NOT_SPACE, pynini.closure(NEMO_SIGMA) + ) + # this works if spaces in between (good) + # delete space between 2 NEMO_NOT_SPACE(left and right to the space) that are with in a content of NEMO_SIGMA + + graph = remove_space_around_single_quote.optimize() + + return graph diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py index 36394843c..a63769787 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + + import pynini from pynini.lib import pynutil, utf8 diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/time.py b/nemo_text_processing/text_normalization/zh/verbalizers/time.py index aa3baf046..11105a916 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/time.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -38,9 +38,9 @@ def __init__(self, deterministic: bool = True): alphabet_pm = pynini.string_file(get_abs_path("data/time/PM.tsv")) # fundamental components - hour_component = pynutil.delete("hour: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") - minute_component = pynutil.delete("minute: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") - second_component = pynutil.delete("second: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + hour_component = pynutil.delete("hours: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + minute_component = pynutil.delete("minutes: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + second_component = pynutil.delete("seconds: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") graph_regular = ( hour_component | minute_component @@ -52,7 +52,7 @@ def __init__(self, deterministic: bool = True): ) # back count 三点差五分 - delete_verb = pynutil.delete("verb: \"") + pynini.accep("差") + pynutil.delete("\"") + delete_verb = pynutil.delete("morphosyntactic_features: \"") + pynini.accep("差") + pynutil.delete("\"") graph_back_count = ( ( pynini.closure(delete_verb + pynutil.insert(' ')) diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py index da4d64ca0..221fbcbc7 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,26 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. + + import pynini -from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst +from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst, delete_space from nemo_text_processing.text_normalization.zh.verbalizers.cardinal import CardinalFst from nemo_text_processing.text_normalization.zh.verbalizers.date import DateFst from nemo_text_processing.text_normalization.zh.verbalizers.decimal import DecimalFst from nemo_text_processing.text_normalization.zh.verbalizers.fraction import FractionFst -from nemo_text_processing.text_normalization.zh.verbalizers.math_symbol import MathSymbol -from nemo_text_processing.text_normalization.zh.verbalizers.measure import Measure +from nemo_text_processing.text_normalization.zh.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.zh.verbalizers.money import MoneyFst from nemo_text_processing.text_normalization.zh.verbalizers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.zh.verbalizers.time import TimeFst from nemo_text_processing.text_normalization.zh.verbalizers.whitelist import Whitelist -from nemo_text_processing.text_normalization.zh.verbalizers.word import Char +from nemo_text_processing.text_normalization.zh.verbalizers.word import WordFst class VerbalizeFst(GraphFst): """ Composes other verbalizer grammars. - For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment.
Args: deterministic: if True will provide a single transduction option, @@ -42,28 +43,27 @@ def __init__(self, deterministic: bool = True): date = DateFst(deterministic=deterministic) cardinal = CardinalFst(deterministic=deterministic) - char = Char(deterministic=deterministic) + ordinal = OrdinalFst(deterministic=deterministic) decimal = DecimalFst(deterministic=deterministic) + word = WordFst(deterministic=deterministic) fraction = FractionFst(decimal=decimal, deterministic=deterministic) - math_symbol = MathSymbol(deterministic=deterministic) money = MoneyFst(decimal=decimal, deterministic=deterministic) - measure = Measure(deterministic=deterministic) - ordinal = OrdinalFst() + measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic) time = TimeFst(deterministic=deterministic) whitelist = Whitelist(deterministic=deterministic) graph = pynini.union( date.fst, cardinal.fst, + ordinal.fst, decimal.fst, fraction.fst, - char.fst, - math_symbol.fst, + word.fst, money.fst, measure.fst, - ordinal.fst, time.fst, whitelist.fst, ) + graph = pynini.closure(delete_space) + graph + pynini.closure(delete_space) self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py index e4b0927d0..b16625530 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,7 +20,7 @@ from nemo_text_processing.text_normalization.zh.verbalizers.postprocessor import PostProcessor from nemo_text_processing.text_normalization.zh.verbalizers.verbalize import VerbalizeFst -# from nemo_text_processing.utils.logging import logger +# from nemo.utils import logging class VerbalizeFinalFst(GraphFst): @@ -38,6 +38,7 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ self.fst = pynini.Far(far_file, mode="r")["verbalize"] else: token_graph = VerbalizeFst(deterministic=deterministic) + token_verbalizer = ( pynutil.delete("tokens {") + delete_space + token_graph.fst + delete_space + pynutil.delete(" }") ) @@ -46,5 +47,3 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ postprocessor = PostProcessor(remove_puncts=False, to_upper=False, to_lower=False, tag_oov=False,) self.fst = (verbalizer @ postprocessor.fst).optimize() - if far_file: - generator_main(far_file, {"verbalize": self.fst}) diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py index 3be84e0a0..662cf9f28 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ + import pynini from pynini.lib import pynutil diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/word.py b/nemo_text_processing/text_normalization/zh/verbalizers/word.py index bdcafef96..f30f254c5 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/word.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,12 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + + from pynini.lib import pynutil from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst -class Char(GraphFst): +class WordFst(GraphFst): ''' tokens { char: "你" } -> 你 ''' diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_cardinal.txt index 11f3f8cc1..573042f7a 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_cardinal.txt @@ -1,161 +1,85 @@ 10000~一万 -负10000~负一万 -正100000~正十万 +负1000~负一千 +100000~十万 210~二百一十 -负200~负二百 -负219~负二百一十九 +200~二百 +219~二百一十九 5000~五千 5100~五千一百 5110~五千一百一十 5111~五千一百一十一 5010~五千零一十 -5099~五千零九十九 +5010~五千零一十 5001~五千零一 -5,000~五千 -5,100~五千一百 -5,010~五千零一十 -5,001~五千零一 -50,000~五万 -51,000~五万一千 -50,100~五万零一百 -50,010~五万零一十 -51,100~五万一千一百 -51,110~五万一千一百一十 -50,011~五万零一十一 -50,001~五万零一 -50,010~五万零一十 50000~五万 51000~五万一千 50100~五万零一百 50010~五万零一十 -50001~五万零一 51100~五万一千一百 51110~五万一千一百一十 -51111~五万一千一百一十一 -50111~五万零一百一十一 50011~五万零一十一 -500,000~五十万 -510,000~五十一万 
-501,000~五十万一千 -500,100~五十万零一百 -500,010~五十万零一十 -500,111~五十万零一百一十一 -501,100~五十万一千一百 -501,111~五十万一千一百一十一 -511,111~五十一万一千一百一十一 -100,000~十万 -110,000~十一万 +50001~五万零一 +50010~五万零一十 +50000~五万 +51000~五万一千 +50100~五万零一百 +50010~五万零一十 +50001~五万零一 +51100~五万一千一百 +50011~五万零一十一 +500000~五十万 +510000~五十一万 +501000~五十万一千 +500100~五十万零一百 +100000~十万 +110000~十一万 500000~五十万 510000~五十一万 501000~五十万一千 500100~五十万零一百 500010~五十万零一十 -500001~五十万零一 -500111~五十万零一百一十一 -501100~五十万一千一百 501111~五十万一千一百一十一 511111~五十一万一千一百一十一 100000~十万 110000~十一万 1100000~一百一十万 -1010000~一百零一万 -1001000~一百万一千 -1000100~一百万零一百 -1000010~一百万零一十 1000001~一百万零一 1000000~一百万 -1,000,000~一百万 -1,100,000~一百一十万 -1,010,000~一百零一万 -1,001,000~一百万一千 -1,000,100~一百万零一百 -1,000,010~一百万零一十 -1,000,001~一百万零一 10000000~一千万 11000000~一千一百万 10100000~一千零一十万 10010000~一千零一万 +11000000~一千一百万 +10100000~一千零一十万 +10010000~一千零一万 10001000~一千万一千 -10010100~一千零一万零一百 -10010010~一千零一万零一十 -10010001~一千零一万零一 -10,000,000~一千万 -11,000,000~一千一百万 -10,100,000~一千零一十万 -10,010,000~一千零一万 -10,001,000~一千万一千 -10,010,100~一千零一万零一百 -10,010,010~一千零一万零一十 -10,010,001~一千零一万零一 -101111111~一亿一百一十一万一千一百一十一 +101111111~一亿零一百一十一万一千一百一十一 110111111~一亿一千零一十一万一千一百一十一 111011111~一亿一千一百零一万一千一百一十一 111101111~一亿一千一百一十万一千一百一十一 111110111~一亿一千一百一十一万零一百一十一 -111111011~一亿一千一百一十一万一千零一十一 -111111101~一亿一千一百一十一万一千一百零一 -111111111~一亿一千一百一十一万一千一百一十一 -101,111,111~一亿一百一十一万一千一百一十一 -110,111,111~一亿一千零一十一万一千一百一十一 -111,011,111~一亿一千一百零一万一千一百一十一 -111,101,111~一亿一千一百一十万一千一百一十一 -111,110,111~一亿一千一百一十一万零一百一十一 -111,111,011~一亿一千一百一十一万一千零一十一 -111,111,101~一亿一千一百一十一万一千一百零一 -111,111,110~一亿一千一百一十一万一千一百一十 -111,111,111~一亿一千一百一十一万一千一百一十一 +101111111~一亿零一百一十一万一千一百一十一 +110111111~一亿一千零一十一万一千一百一十一 +111011111~一亿一千一百零一万一千一百一十一 +111101111~一亿一千一百一十万一千一百一十一 1011111111~十亿一千一百一十一万一千一百一十一 -1101111111~十一亿一百一十一万一千一百一十一 -1110111111~十一亿一千零一十一万一千一百一十一 +1101111111~十一亿零一百一十一万一千一百一十一 1111011111~十一亿一千一百零一万一千一百一十一 -1111110111~十一亿一千一百一十一万零一百一十一 -1111111101~十一亿一千一百一十一万一千一百零一 -1111111111~十一亿一千一百一十一万一千一百一十一 -1,011,111,111~十亿一千一百一十一万一千一百一十一 -1,101,111,111~十一亿一百一十一万一千一百一十一 
-1,110,111,111~十一亿一千零一十一万一千一百一十一 -1,111,011,111~十一亿一千一百零一万一千一百一十一 -1,111,101,111~十一亿一千一百一十万一千一百一十一 -1,111,110,111~十一亿一千一百一十一万零一百一十一 -1,111,111,011~十一亿一千一百一十一万一千零一十一 -1,111,111,101~十一亿一千一百一十一万一千一百零一 -1,111,111,110~十一亿一千一百一十一万一千一百一十 +11000000000~一百一十亿 +10000100000~一百亿零十万 11000000000~一百一十亿 10100000000~一百零一亿 10010000000~一百亿一千万 -10001000000~一百亿一百万 -10000100000~一百亿零十万 -11,000,000,000~一百一十亿 -10,100,000,000~一百零一亿 -10,010,000,000~一百亿一千万 -10,001,000,000~一百亿一百万 -10,000,100,000~一百亿零十万 -10,000,010,000~一百亿零一万 -10,000,001,000~一百亿零一千 -10,000,000,100~一百亿零一百 -10,000,000,010~一百亿零一十 -10,000,000,001~一百亿零一 +10001000000~一百亿零一百万 +10000000001~一百亿零一 100000100000~一千亿零十万 100000010000~一千亿零一万 100000001000~一千亿零一千 -100000000100~一千亿零一百 -100000000010~一千亿零一十 -100000000001~一千亿零一 -100,000,000,000~一千亿 -110,000,000,000~一千一百亿 -101,000,000,000~一千零一十亿 -100,100,000,000~一千零一亿 -100,010,000,000~一千亿一千万 -100,001,000,000~一千亿一百万 -100,000,100,000~一千亿零十万 -100,000,010,000~一千亿零一万 -100,000,001,000~一千亿零一千 -100,000,000,100~一千亿零一百 -20万~二十万 -5万~五万 -100万~一百万 -1500万~一千五百万 -20亿~二十亿 -100亿~一百亿 -1500亿~一千五百亿 -9亿~九亿 +101000000000~一千零一十亿 +100100000000~一千零一亿 +我今天买了5个苹果~我今天买了五个苹果 +我今天买了25个苹果~我今天买了二十五个苹果 +我今天买了35个苹果~我今天买了三十五个苹果 +我今天买了50000个苹果~我今天买了五万个苹果 +我今天买了150000个苹果~我今天买了十五万个苹果 +双辽境内除东辽河、西辽河等5条河流~双辽境内除东辽河、西辽河等五条河流 diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_date.txt index c0963e9d0..52ab15f44 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_date.txt @@ -27,4 +27,9 @@ 2020年11月24日~二零二零年十一月二十四日 公元2020年11月24日~公元二零二零年十一月二十四日 1823年3月bc~公元前一八二三年三月 -纪元2013年~纪元二零一三年 \ No newline at end of file +纪元2013年~纪元二零一三年 +今天是2013年3月3日~今天是二零一三年三月三日 +现在是12月25日圣诞节~现在是十二月二十五日圣诞节 +文件上的标注日期是12/2/2~文件上的标注日期是一二年二月二日 +现在是入冬的12月~现在是入冬的十二月 +公元前202年西汉成立~公元前二零二年西汉成立 \ No newline at end of file diff --git 
a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_decimal.txt index 10e49a97f..4d53200fd 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_decimal.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_decimal.txt @@ -5,26 +5,31 @@ -5.5555~负五点五五五五 1890.5555~一千八百九十点五五五五 20.123~二十点一二三 --2930.1929~负二千九百三十点一九二九 +-2930.1929~负两千九百三十点一九二九 0.5~零点五 5.0~五点零 10.567~十点五六七 -123.123~负一百二十三点一二三 3123.1231~三千一百二十三点一二三一 -123123123.12312334234~一亿二千三百一十二万三千一百二十三点一二三一二三三四二三四 +123123123.12312334234~一亿两千三百一十二万三千一百二十三点一二三一二三三四二三四 83888123.2398412~八千三百八十八万八千一百二十三点二三九八四一二 283818.28994万~二十八万三千八百一十八点二八九九四万 -28394919.2312亿~二千八百三十九万四千九百一十九点二三一二亿 +28394919.2312亿~两千八百三十九万四千九百一十九点二三一二亿 23.23万~二十三点二三万 1233.1亿~一千二百三十三点一亿 -123.213万~负一百二十三点二一三万 123.890万~一百二十三点八九零万 -233123.9940亿~负二十三万三千一百二十三点九九四零亿 283943.234123亿~二十八万三千九百四十三点二三四一二三亿 -2391.2318~二千三百九十一点二三一八 +2391.2318~两千三百九十一点二三一八 -1.5亿~负一点五亿 1.5亿~一点五亿 10.67亿~十点六七亿 16.3亿~十六点三亿 12.2亿~十二点二亿 -2342.2342亿~二千三百四十二点二三四二亿 \ No newline at end of file +2342.2342亿~两千三百四十二点二三四二亿 +公司的年收益率是6.5~公司的年收益率是六点五 +现在的室外气温是36.7摄氏度~现在的室外气温是三十六点七摄氏度 +我们可以给你返还1.2个百分点~我们可以给你返还一点二个百分点 +全球现今有71.5亿人~全球现今有七十一点五亿人 +小张的资产值13.5亿~小张的资产值十三点五亿 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_fraction.txt index bae47330d..c39f680f4 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_fraction.txt @@ -4,11 +4,7 @@ -1/3~负三分之一 负1/3~负三分之一 1/2~二分之一 -+1/2~正二分之一 -正1/2~正二分之一 1/10~十分之一 -+100分之1~正一百分之一 -正100分之1~正一百分之一 98%~百分之九十八 -98%~负百分之九十八 负98%~负百分之九十八 @@ -16,19 +12,15 @@ -百分之1~负百分之一 负百分之1~负百分之一 -100分之57~负一百分之五十七 -正100分之57~正一百分之五十七 负100分之57~负一百分之五十七 1/5~五分之一 -1/5~负五分之一 -+1/5~正五分之一 1又1/5~一又五分之一 -+1又1/5~正一又五分之一 
-正1又1/5~正一又五分之一 5又2分之1~五又二分之一 -+5又2分之1~正五又二分之一 -正5又2分之1~正五又二分之一 -1/100~一百分之一 -+1/100~正一百分之一 -正1/100~正一百分之一 +0.4%~百分之零点四 6.3%~百分之六点三 -0.4%~百分之零点四 \ No newline at end of file +公司的年收益率是6.5%~公司的年收益率是百分之六点五 +60%的人口是男性~百分之六十的人口是男性 +全校有80%的学生来自大山深处的贫困地区~全校有百分之八十的学生来自大山深处的贫困地区 +我们的队伍有1/3的人是士官学校出身~我们的队伍有三分之一的人是士官学校出身 +今年的降雨量较往年多了5%~今年的降雨量较往年多了百分之五 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_math.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_math.txt deleted file mode 100644 index d31a3a8d4..000000000 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_math.txt +++ /dev/null @@ -1,2 +0,0 @@ -78:96~七十八比九十六 -±2~正负二 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_measure.txt index 094afc7c4..d220c406a 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_measure.txt @@ -2,4 +2,10 @@ 38°C~三十八摄氏度 120m²~一百二十平方米 10ms~十毫秒 +25千克~二十五千克 -23°C~负二十三摄氏度 +1.2g~一点二克 +测量机显示重量是25kg~测量机显示重量是二十五千克 +现在的室外温度是38°C~现在的室外温度是三十八摄氏度 +这个房子大概有120m²~这个房子大概有一百二十平方米 +整体时长大概是10ms~整体时长大概是十毫秒 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_money.txt index 28075fca1..71dd98d71 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_money.txt @@ -12,7 +12,11 @@ $100~一百美元 5角~五角 5块~五块 6毛~六毛 -5块5毛5分~五块五毛五分 -1.5万美元~一点五万美元 $1.5万~一点五万美元 -3.5万韩元~三点五万韩元 \ No newline at end of file +3.5万韩元~三点五万韩元 +1.5万美元~一点五万美元 +我现在能拿出的现金是100000美元~我现在能拿出的现金是十万美元 +一份煎饼卖5块一份~一份煎饼卖五块一份 +每100美元能兑换700人民币左右~每一百美元能兑换七百人民币左右 +您的银行账户余额为$500~您的银行账户余额为五百美元 
+洛杉矶的最低工资是每小时$15~洛杉矶的最低工资是每小时十五美元 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_ordinal.txt index 57ea76bc1..e84b3dd8d 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_ordinal.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_ordinal.txt @@ -1,149 +1,84 @@ 第10000~第一万 第210~第二百一十 第5000~第五千 -第5100~第五千一百 -第5110~第五千一百一十 -第5111~第五千一百一十一 -第5010~第五千零一十 第5099~第五千零九十九 第5001~第五千零一 -第5,000~第五千 -第5,100~第五千一百 -第5,010~第五千零一十 -第5,001~第五千零一 -第50,000~第五万 -第51,000~第五万一千 -第50,100~第五万零一百 -第50,010~第五万零一十 -第51,100~第五万一千一百 -第51,110~第五万一千一百一十 -第50,011~第五万零一十一 -第50,001~第五万零一 -第50,010~第五万零一十 +第5000~第五千 +第5100~第五千一百 +第50010~第五万零一十 +第51100~第五万一千一百 +第50010~第五万零一十 第50000~第五万 第51000~第五万一千 第50100~第五万零一百 第50010~第五万零一十 -第50001~第五万零一 -第51100~第五万一千一百 -第51110~第五万一千一百一十 -第51111~第五万一千一百一十一 第50111~第五万零一百一十一 第50011~第五万零一十一 -第500,000~第五十万 -第510,000~第五十一万 -第501,000~第五十万一千 -第500,100~第五十万零一百 -第500,010~第五十万零一十 -第500,111~第五十万零一百一十一 -第501,100~第五十万一千一百 -第501,111~第五十万一千一百一十一 -第511,111~第五十一万一千一百一十一 -第100,000~第十万 -第110,000~第十一万 +第500000~第五十万 +第510000~第五十一万 +第501000~第五十万一千 +第100000~第十万 +第110000~第十一万 第500000~第五十万 第510000~第五十一万 第501000~第五十万一千 第500100~第五十万零一百 -第500010~第五十万零一十 -第500001~第五十万零一 -第500111~第五十万零一百一十一 -第501100~第五十万一千一百 第501111~第五十万一千一百一十一 第511111~第五十一万一千一百一十一 第100000~第十万 -第110000~第十一万 -第1100000~第一百一十万 -第1010000~第一百零一万 -第1001000~第一百万一千 第1000100~第一百万零一百 第1000010~第一百万零一十 第1000001~第一百万零一 第1000000~第一百万 -第1,000,000~第一百万 -第1,100,000~第一百一十万 -第1,010,000~第一百零一万 -第1,001,000~第一百万一千 -第1,000,100~第一百万零一百 -第1,000,010~第一百万零一十 -第1,000,001~第一百万零一 +第1000100~第一百万零一百 +第1000010~第一百万零一十 +第1000001~第一百万零一 +第10000000~第一千万 +第11000000~第一千一百万 +第10010001~第一千零一万零一 第10000000~第一千万 第11000000~第一千一百万 -第10100000~第一千零一十万 -第10010000~第一千零一万 -第10001000~第一千万一千 -第10010100~第一千零一万零一百 -第10010010~第一千零一万零一十 第10010001~第一千零一万零一 -第10,000,000~第一千万 -第11,000,000~第一千一百万 
-第10,100,000~第一千零一十万 -第10,010,000~第一千零一万 -第10,001,000~第一千万一千 -第10,010,100~第一千零一万零一百 -第10,010,010~第一千零一万零一十 -第10,010,001~第一千零一万零一 -第101111111~第一亿一百一十一万一千一百一十一 +第101111111~第一亿零一百一十一万一千一百一十一 第110111111~第一亿一千零一十一万一千一百一十一 第111011111~第一亿一千一百零一万一千一百一十一 第111101111~第一亿一千一百一十万一千一百一十一 -第111110111~第一亿一千一百一十一万零一百一十一 -第111111011~第一亿一千一百一十一万一千零一十一 -第111111101~第一亿一千一百一十一万一千一百零一 +第101111111~第一亿零一百一十一万一千一百一十一 +第110111111~第一亿一千零一十一万一千一百一十一 +第111111110~第一亿一千一百一十一万一千一百一十 第111111111~第一亿一千一百一十一万一千一百一十一 -第101,111,111~第一亿一百一十一万一千一百一十一 -第110,111,111~第一亿一千零一十一万一千一百一十一 -第111,011,111~第一亿一千一百零一万一千一百一十一 -第111,101,111~第一亿一千一百一十万一千一百一十一 -第111,110,111~第一亿一千一百一十一万零一百一十一 -第111,111,011~第一亿一千一百一十一万一千零一十一 -第111,111,101~第一亿一千一百一十一万一千一百零一 -第111,111,110~第一亿一千一百一十一万一千一百一十 -第111,111,111~第一亿一千一百一十一万一千一百一十一 第1011111111~第十亿一千一百一十一万一千一百一十一 -第1101111111~第十一亿一百一十一万一千一百一十一 +第1101111111~第十一亿零一百一十一万一千一百一十一 第1110111111~第十一亿一千零一十一万一千一百一十一 第1111011111~第十一亿一千一百零一万一千一百一十一 第1111110111~第十一亿一千一百一十一万零一百一十一 第1111111101~第十一亿一千一百一十一万一千一百零一 第1111111111~第十一亿一千一百一十一万一千一百一十一 -第1,011,111,111~第十亿一千一百一十一万一千一百一十一 -第1,101,111,111~第十一亿一百一十一万一千一百一十一 -第1,110,111,111~第十一亿一千零一十一万一千一百一十一 -第1,111,011,111~第十一亿一千一百零一万一千一百一十一 -第1,111,101,111~第十一亿一千一百一十万一千一百一十一 -第1,111,110,111~第十一亿一千一百一十一万零一百一十一 -第1,111,111,011~第十一亿一千一百一十一万一千零一十一 -第1,111,111,101~第十一亿一千一百一十一万一千一百零一 -第1,111,111,110~第十一亿一千一百一十一万一千一百一十 +第1011111111~第十亿一千一百一十一万一千一百一十一 +第1101111111~第十一亿零一百一十一万一千一百一十一 +第1110111111~第十一亿一千零一十一万一千一百一十一 +第11000000000~第一百一十亿 +第10000100000~第一百亿零十万 第11000000000~第一百一十亿 第10100000000~第一百零一亿 第10010000000~第一百亿一千万 -第10001000000~第一百亿一百万 +第10001000000~第一百亿零一百万 第10000100000~第一百亿零十万 -第11,000,000,000~第一百一十亿 -第10,100,000,000~第一百零一亿 -第10,010,000,000~第一百亿一千万 -第10,001,000,000~第一百亿一百万 -第10,000,100,000~第一百亿零十万 -第10,000,010,000~第一百亿零一万 -第10,000,001,000~第一百亿零一千 -第10,000,000,100~第一百亿零一百 -第10,000,000,010~第一百亿零一十 -第10,000,000,001~第一百亿零一 +第10000000100~第一百亿零一百 +第10000000010~第一百亿零一十 +第10000000001~第一百亿零一 第100000100000~第一千亿零十万 第100000010000~第一千亿零一万 第100000001000~第一千亿零一千 
-第100000000100~第一千亿零一百 第100000000010~第一千亿零一十 第100000000001~第一千亿零一 -第100,000,000,000~第一千亿 -第110,000,000,000~第一千一百亿 -第101,000,000,000~第一千零一十亿 -第100,100,000,000~第一千零一亿 -第100,010,000,000~第一千亿一千万 -第100,001,000,000~第一千亿一百万 -第100,000,100,000~第一千亿零十万 -第100,000,010,000~第一千亿零一万 -第100,000,001,000~第一千亿零一千 -第100,000,000,100~第一千亿零一百 \ No newline at end of file +第100000000000~第一千亿 +第110000000000~第一千一百亿 +第101000000000~第一千零一十亿 +第100100000000~第一千零一亿 +第100010000000~第一千亿一千万 +这个孩子的学习成绩一直是全年级第1~这个孩子的学习成绩一直是全年级第一 +从这一排往下数第5个就是小明~从这一排往下数第五个就是小明 +恭喜您成为本店第100名顾客~恭喜您成为本店第一百名顾客 +这是你人生的第1桶金~这是你人生的第一桶金 +这个名单从头开始到第100都是你的目标客户~这个名单从头开始到第一百都是你的目标客户 diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_preprocess.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_preprocess.txt deleted file mode 100644 index e1b592ebc..000000000 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_preprocess.txt +++ /dev/null @@ -1 +0,0 @@ -你啊好~你好 diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_time.txt index 94b45ac30..9523492a0 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_time.txt @@ -1,4 +1,4 @@ -3:4:5~三点四分五秒 +03:04:05~三点四分五秒 03:04:05~三点四分五秒 00:00:00~零点零分零秒 03:04:05~三点四分五秒 @@ -27,4 +27,8 @@ 5点差3分~五点差三分 5点差5分~五点差五分 5点差4分am~五点差四分am -3个小时15分钟30秒~三个小时十五分钟三十秒 \ No newline at end of file +3个小时15分钟30秒~三个小时十五分钟三十秒 +现在是北京时间下午03:04:05~现在是北京时间下午三点四分五秒 +航班预计会延误5个小时~航班预计会延误五个小时 +大家尽量把手表对准调到5点1刻~大家尽量把手表对准调到五点一刻 +5点1刻离六点就差十五分钟~五点一刻离六点就差十五分钟 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..1700f1af6 --- /dev/null +++ 
b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,10 @@ +这附近有Atm~这附近有ATM +这是一个ufo的照片~这是一个UFO的照片 +nba比赛如期举行~NBA比赛如期举行 +我们需要升级gpu~我们需要升级GPU +他是这个公司的c e o~他是这个公司的CEO +我们已经加入了wto~我们已经加入了WTO +小王以优秀的战绩成为这场游戏的mvp~小王以优秀的战绩成为这场游戏的MVP +这位客人是我们的vip~这位客人是我们的VIP +小王的iq是全班最高的~小王的IQ是全班最高的 +小李读了一个mba~小李读了一个MBA \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt index 4fedd2cd9..23270bf82 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt @@ -1,2 +1,7 @@ 你~你 -好~好 \ No newline at end of file +好~好 +你好今天的天气不错~你好今天的天气不错 +只有智商超过一定数值的人才能破解~只有智商超过一定数值的人才能破解 +这是由人工智能控制的系统~这是由人工智能控制的系统 +欧洲旅游目的地多到不知道怎么选~欧洲旅游目的地多到不知道怎么选 +马斯科卖掉豪宅住进折叠屋~马斯科卖掉豪宅住进折叠屋 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/test_math.py b/tests/nemo_text_processing/zh/test_math.py deleted file mode 100644 index cf44a5c22..000000000 --- a/tests/nemo_text_processing/zh/test_math.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -from parameterized import parameterized - -from nemo_text_processing.text_normalization.normalize import Normalizer - -from ..utils import CACHE_DIR, parse_test_case_file - - -class TestMath: - normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') - - @parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_math.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_norm_math(self, test_input, expected): - preds = self.normalizer_zh.normalize(test_input) - assert expected == preds diff --git a/tests/nemo_text_processing/zh/test_preprocess.py b/tests/nemo_text_processing/zh/test_preprocess.py deleted file mode 100644 index 34838cc90..000000000 --- a/tests/nemo_text_processing/zh/test_preprocess.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -from parameterized import parameterized - -from nemo_text_processing.text_normalization.normalize import Normalizer - -from ..utils import CACHE_DIR, parse_test_case_file - - -class TestPreprocess: - normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') - - @parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_preprocess.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_norm_preprocess(self, test_input, expected): - preds = self.normalizer_zh.normalize(test_input) - assert expected == preds diff --git a/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh index 4cbbf0d0d..dd352b42b 100644 --- a/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh @@ -12,7 +12,7 @@ runtest () { while read testcase; do IFS='~' read written spoken <<< $testcase # replace non breaking space with breaking space - denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration_pp.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') # # trim white space spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" @@ -23,35 +23,42 @@ runtest () { done < "$input" } -testTNMoneyText() { - input=$PROJECT_DIR/zh/data_text_normalization/test_cases_money.txt +testTNTimeText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_time.txt runtest $input } -testTNCharText() { - input=$PROJECT_DIR/zh/data_text_normalization/test_cases_char.txt +testTNCardinalText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_cardinal.txt runtest $input } -testTNTimeText() { - input=$PROJECT_DIR/zh/data_text_normalization/test_cases_time.txt 
+testTNOrdinalText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_ordinal.txt runtest $input } -testTNDateText() { - input=$PROJECT_DIR/zh/data_text_normalization/test_cases_date.txt +testTNDecimalalText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_decimal.txt runtest $input } -# testTNMathText() { -# input=$PROJECT_DIR/zh/data_text_normalization/test_cases_math.txt -# runtest $input -# } testTNFractionText() { input=$PROJECT_DIR/zh/data_text_normalization/test_cases_fraction.txt runtest $input } - -# testTNPreprocessText() { -# input=$PROJECT_DIR/zh/data_text_normalization/test_cases_preprocess.txt -# runtest $input -# } +testTNDateText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_date.txt + runtest $input +} +testTNMoneyText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_money.txt + runtest $input +} +testTNWordText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_word.txt + runtest $input +} +testTNWhitelistText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_whitelist.txt + runtest $input +} testTNMeasureText() { input=$PROJECT_DIR/zh/data_text_normalization/test_cases_measure.txt runtest $input @@ -59,4 +66,5 @@ testTNMeasureText() { # Load shUnit2 -. $PROJECT_DIR/../shunit2/shunit2 +#. $PROJECT_DIR/../shunit2/shunit2 +. 
/workspace/shunit2/shunit2 diff --git a/tests/nemo_text_processing/zh/test_time.py b/tests/nemo_text_processing/zh/test_time.py index ed285983b..590fd591f 100644 --- a/tests/nemo_text_processing/zh/test_time.py +++ b/tests/nemo_text_processing/zh/test_time.py @@ -31,11 +31,11 @@ def test_norm_time(self, test_input, expected): preds = self.normalizer_zh.normalize(test_input) assert expected == preds - # inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) - - # @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_time.txt')) - # @pytest.mark.run_only_on('CPU') - # @pytest.mark.unit - # def test_denorm(self, test_input, expected): - # pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - # assert pred == expected + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_whitelist.py b/tests/nemo_text_processing/zh/test_whitelist.py index 8e6087f53..deb857e7a 100644 --- a/tests/nemo_text_processing/zh/test_whitelist.py +++ b/tests/nemo_text_processing/zh/test_whitelist.py @@ -1,32 +1,41 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import pytest -from parameterized import parameterized - -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer - -from ..utils import CACHE_DIR, parse_test_case_file - - -class TestWhitelist: - inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) - - @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_whitelist.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDate: + normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + + @parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm_date(self, test_input, expected): + preds = self.normalizer_zh.normalize(test_input) + assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_word.py b/tests/nemo_text_processing/zh/test_word.py index ddf587857..3314ea90b 100644 --- a/tests/nemo_text_processing/zh/test_word.py +++ b/tests/nemo_text_processing/zh/test_word.py @@ -17,6 +17,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -30,3 +31,12 @@ class TestWord: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + + 
@parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm_date(self, test_input, expected): + preds = self.normalizer_zh.normalize(test_input) + assert expected == preds diff --git a/tools/text_processing_deployment/docker/launch.sh b/tools/text_processing_deployment/docker/launch.sh index dea998f7b..1bb4c78ca 100644 --- a/tools/text_processing_deployment/docker/launch.sh +++ b/tools/text_processing_deployment/docker/launch.sh @@ -57,4 +57,4 @@ docker run -it -e LANG=C.UTF-8 -e LC_ALL=C.UTF-8 --rm \ $MOUNTS \ -v $SCRIPT_DIR/../../../tests/nemo_text_processing/:/workspace/tests/ \ -w $WORK_DIR \ - sparrowhawk:latest $CMD \ No newline at end of file + sparrowhawk:latest $CMD diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 7969ee239..427bbaf6e 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -213,6 +213,9 @@ def parse_args(): from nemo_text_processing.text_normalization.zh.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, ) + from nemo_text_processing.text_normalization.zh.verbalizers.post_processing import ( + PostProcessingFst as TNPostProcessingFst, + ) from nemo_text_processing.text_normalization.zh.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst elif args.language == 'ar': from nemo_text_processing.inverse_text_normalization.ar.taggers.tokenize_and_classify import ( @@ -250,10 +253,6 @@ def parse_args(): from nemo_text_processing.inverse_text_normalization.hy.verbalizers.verbalize import ( VerbalizeFst as ITNVerbalizeFst, ) - from nemo_text_processing.text_normalization.hy.taggers.tokenize_and_classify import ( - ClassifyFst as TNClassifyFst, - ) - from nemo_text_processing.text_normalization.hy.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst output_dir = 
os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( output_dir=output_dir, From ec331daf9ab5a9f64a0e841fd5668acfbc8d16e4 Mon Sep 17 00:00:00 2001 From: tbartley94 <90423858+tbartley94@users.noreply.github.com> Date: Fri, 3 May 2024 14:38:04 -0700 Subject: [PATCH 20/90] preparing release, updating change log (#168) * preparing release, updating change log Signed-off-by: Travis Bartley * adding changelog Signed-off-by: Travis Bartley * updating pre release Signed-off-by: Travis Bartley --------- Signed-off-by: Travis Bartley Signed-off-by: Alex Cui --- CHANGELOG.md | 29 ++++++++++++++++++++++++++++ nemo_text_processing/package_info.py | 6 +++--- 2 files changed, 32 insertions(+), 3 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..afd15a19c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,29 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [r1.0.0] - 2024-05-03 + +### Added + +- Sentence level ZH (Mandarin Chinese) TN (#112) +- Enabled post-processing support for Sparrowhawk TN test (#147) + +### Fixed + +- `normalize_with_audio` text-field variable changed (#153) +- `run_evaluate` script for ITN updated for additional languages and casing (#164) + +### Changed + +- Docstring update (#157) + + +### Removed + +- Removed unused function from AR (Arabic) TN decimals (#165) + + diff --git a/nemo_text_processing/package_info.py b/nemo_text_processing/package_info.py index 2638d7176..774f86da6 100644 --- a/nemo_text_processing/package_info.py +++ b/nemo_text_processing/package_info.py @@ -13,10 +13,10 @@ # limitations under the License. 
-MAJOR = 0 -MINOR = 3 +MAJOR = 1 +MINOR = 0 PATCH = 0 -PRE_RELEASE = 'rc0' +PRE_RELEASE = 'r' # Use the following formatting: (major, minor, patch, pre-release) VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) From 61054ba1f8ac06027c798c330f683ad1458c341a Mon Sep 17 00:00:00 2001 From: Evelina <10428420+ekmb@users.noreply.github.com> Date: Fri, 3 May 2024 14:41:21 -0700 Subject: [PATCH 21/90] hotfix (#169) Signed-off-by: Travis Bartley Co-authored-by: Travis Bartley Signed-off-by: Alex Cui --- CHANGELOG.md | 2 +- nemo_text_processing/package_info.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index afd15a19c..3d6155792 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [r1.0.0] - 2024-05-03 +## [r1.0.1] - 2024-05-03 ### Added diff --git a/nemo_text_processing/package_info.py b/nemo_text_processing/package_info.py index 774f86da6..5b1a16af2 100644 --- a/nemo_text_processing/package_info.py +++ b/nemo_text_processing/package_info.py @@ -15,7 +15,7 @@ MAJOR = 1 MINOR = 0 -PATCH = 0 +PATCH = 1 PRE_RELEASE = 'r' # Use the following formatting: (major, minor, patch, pre-release) From 498781fe87da455709997d6ec6529d0e17ef211a Mon Sep 17 00:00:00 2001 From: tbartley94 <90423858+tbartley94@users.noreply.github.com> Date: Fri, 3 May 2024 14:49:41 -0700 Subject: [PATCH 22/90] hotfix (#170) Signed-off-by: Travis Bartley Signed-off-by: Alex Cui --- CHANGELOG.md | 2 +- nemo_text_processing/package_info.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d6155792..7eefd953c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [r1.0.1] - 2024-05-03 +## [r1.0.2] - 2024-05-03 ### Added diff --git a/nemo_text_processing/package_info.py b/nemo_text_processing/package_info.py index 5b1a16af2..84dfa4da6 100644 --- a/nemo_text_processing/package_info.py +++ b/nemo_text_processing/package_info.py @@ -15,8 +15,8 @@ MAJOR = 1 MINOR = 0 -PATCH = 1 -PRE_RELEASE = 'r' +PATCH = 2 +PRE_RELEASE = '' # Use the following formatting: (major, minor, patch, pre-release) VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) From d290748b119d86ca4e6221e7b3d3ab93205d2ab9 Mon Sep 17 00:00:00 2001 From: Simon Zuberek Date: Thu, 6 Jun 2024 18:07:29 -0400 Subject: [PATCH 23/90] DE TN Fixes (#177) * Adds support for social media tags (e.g. @zoobereq) Signed-off-by: Simon Zuberek * Adds test cases for social media tags Signed-off-by: Simon Zuberek * Fixes pathing for Sparrowhawk Signed-off-by: Simon Zuberek * Fixes the issue of the DE normalizer not accepting comma-separated digit strings Signed-off-by: Simon Zuberek * Fixes the issue where the normalizer didn't accept time formatted as 00.00 Uhr or 0.00 Uhr Signed-off-by: Simon Zuberek * Fixes the issue where the the sentence-final period in sentences ending with a domain name would be tagged as part of that domain name Signed-off-by: Simon Zuberek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Removes unused imports Signed-off-by: Simon Zuberek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixes the formatting Signed-off-by: Simon Zuberek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixes https://github.com/NVIDIA/NeMo-text-processing/issues/166 for DE Signed-off-by: Simon Zuberek * [pre-commit.ci] auto fixes from pre-commit.com 
hooks for more information, see https://pre-commit.ci * Updates grammar paths Signed-off-by: Simon Zuberek * Minor Fixes Signed-off-by: Simon Zuberek * Fixes test cases Signed-off-by: Simon Zuberek --------- Signed-off-by: Simon Zuberek Co-authored-by: Simon Zuberek Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- .../de/data/electronic/domain.tsv | 2 +- .../de/taggers/electronic.py | 47 ++++++++++++------- .../text_normalization/de/taggers/time.py | 28 +++++------ .../de/taggers/tokenize_and_classify.py | 10 ++-- .../de/verbalizers/electronic.py | 13 ++--- .../test_cases_date.txt | 1 - .../test_cases_decimal.txt | 5 +- .../test_cases_electronic.txt | 8 +++- .../test_cases_time.txt | 4 +- ..._sparrowhawk_inverse_text_normalization.sh | 2 +- .../de/test_sparrowhawk_normalization.sh | 2 +- 12 files changed, 73 insertions(+), 51 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 6822ee055..6f3c3780a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -12,7 +12,7 @@ pipeline { environment { AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0' - DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' + DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0' EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-18-23-0' ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0' ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2' diff --git a/nemo_text_processing/text_normalization/de/data/electronic/domain.tsv b/nemo_text_processing/text_normalization/de/data/electronic/domain.tsv index 5738cd66c..1950e719a 100644 --- a/nemo_text_processing/text_normalization/de/data/electronic/domain.tsv +++ b/nemo_text_processing/text_normalization/de/data/electronic/domain.tsv @@ -1,7 +1,7 @@ .com punkt com .uk punkt uk .fr punkt fr -.net dot net +.net punkt net .br punkt br .in punkt in .ru punkt ru 
diff --git a/nemo_text_processing/text_normalization/de/taggers/electronic.py b/nemo_text_processing/text_normalization/de/taggers/electronic.py index 8bca4646b..85beb41a2 100644 --- a/nemo_text_processing/text_normalization/de/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/de/taggers/electronic.py @@ -35,31 +35,44 @@ def __init__(self, deterministic: bool = True): super().__init__(name="electronic", kind="classify", deterministic=deterministic) dot = pynini.accep(".") - accepted_common_domains = [x[0] for x in load_labels(get_abs_path("data/electronic/domain.tsv"))] - accepted_common_domains = pynini.union(*accepted_common_domains) - accepted_symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))] - accepted_symbols = pynini.union(*accepted_symbols) - dot - accepted_characters = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols) + + symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))] + symbols = pynini.union(*symbols) + # all symbols + symbols_no_period = pynini.difference(symbols, dot) # alphabet of accepted symbols excluding the '.' + accepted_characters = pynini.closure( + (NEMO_ALPHA | NEMO_DIGIT | symbols_no_period), 1 + ) # alphabet of accepted chars excluding the '.' + all_characters = pynini.closure( + (NEMO_ALPHA | NEMO_DIGIT | symbols), 1 + ) # alphabet of accepted chars including the '.' 
+ + # domains + domain = dot + accepted_characters + domain_graph = ( + pynutil.insert('domain: "') + (accepted_characters + pynini.closure(domain, 1)) + pynutil.insert('"') + ) # email - username = pynutil.insert("username: \"") + accepted_characters + pynutil.insert("\"") + pynini.cross('@', ' ') - domain_graph = accepted_characters + dot + accepted_characters - domain_graph = pynutil.insert("domain: \"") + domain_graph + pynutil.insert("\"") - domain_common_graph = ( - pynutil.insert("domain: \"") - + accepted_characters - + accepted_common_domains - + pynini.closure((accepted_symbols | dot) + pynini.closure(accepted_characters, 1), 0, 1) - + pynutil.insert("\"") + username = pynutil.insert('username: "') + all_characters + pynutil.insert('"') + pynini.cross("@", " ") + email = username + domain_graph + + # social media tags + tag = ( + pynini.cross("@", "") + + pynutil.insert('username: "') + + (accepted_characters | (accepted_characters + pynini.closure(domain, 1))) + + pynutil.insert('"') ) - graph = (username + domain_graph) | domain_common_graph # url protocol_start = pynini.accep("https://") | pynini.accep("http://") protocol_end = pynini.accep("www.") protocol = protocol_start | protocol_end | (protocol_start + protocol_end) - protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"") - graph |= protocol + insert_space + (domain_graph | domain_common_graph) + protocol = pynutil.insert('protocol: "') + protocol + pynutil.insert('"') + url = protocol + insert_space + (domain_graph) + + graph = url | domain_graph | email | tag self.graph = graph final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true")) diff --git a/nemo_text_processing/text_normalization/de/taggers/time.py b/nemo_text_processing/text_normalization/de/taggers/time.py index e9088a4a1..371ad16ac 100644 --- a/nemo_text_processing/text_normalization/de/taggers/time.py +++ b/nemo_text_processing/text_normalization/de/taggers/time.py @@ -27,7 +27,7 @@ 
class TimeFst(GraphFst): "2 Uhr" -> time { hours: "2" } "09:00 Uhr" -> time { hours: "2" } "02:15:10 Uhr" -> time { hours: "2" minutes: "15" seconds: "10"} - + Args: deterministic: if True will provide a single transduction option, for False multiple transduction are generated (used for audio-based normalization) @@ -43,37 +43,35 @@ def __init__(self, deterministic: bool = True): labels_minute_single = [str(x) for x in range(1, 10)] labels_minute_double = [str(x) for x in range(10, 60)] - delete_leading_zero_to_double_digit = (pynutil.delete("0") | (NEMO_DIGIT - "0")) + NEMO_DIGIT + delete_leading_zero_to_double_digit = (pynutil.delete("0").ques | (NEMO_DIGIT - "0")) + NEMO_DIGIT graph_hour = pynini.union(*labels_hour) graph_minute_single = pynini.union(*labels_minute_single) graph_minute_double = pynini.union(*labels_minute_double) - final_graph_hour_only = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"") + final_graph_hour_only = pynutil.insert('hours: "') + graph_hour + pynutil.insert('"') final_graph_hour = ( - pynutil.insert("hours: \"") + delete_leading_zero_to_double_digit @ graph_hour + pynutil.insert("\"") + pynutil.insert('hours: "') + delete_leading_zero_to_double_digit @ graph_hour + pynutil.insert('"') ) final_graph_minute = ( - pynutil.insert("minutes: \"") + pynutil.insert('minutes: "') + (pynutil.delete("0") + graph_minute_single | graph_minute_double) - + pynutil.insert("\"") + + pynutil.insert('"') ) final_graph_second = ( - pynutil.insert("seconds: \"") + pynutil.insert('seconds: "') + (pynutil.delete("0") + graph_minute_single | graph_minute_double) - + pynutil.insert("\"") + + pynutil.insert('"') ) final_time_zone_optional = pynini.closure( - pynini.accep(" ") + pynutil.insert("zone: \"") + convert_space(time_zone_graph) + pynutil.insert("\""), - 0, - 1, + pynini.accep(" ") + pynutil.insert('zone: "') + convert_space(time_zone_graph) + pynutil.insert('"'), 0, 1, ) - # 02:30 Uhr + # Accepts the following formats: 02:30 Uhr, 02.30 
Uhr, 2:30 Uhr, 2.30 Uhr graph_hm = ( final_graph_hour - + pynutil.delete(":") + + (pynutil.delete(":") | pynutil.delete(".")) + (pynutil.delete("00") | (insert_space + final_graph_minute)) + final_suffix + final_time_zone_optional @@ -83,9 +81,9 @@ def __init__(self, deterministic: bool = True): graph_hms = ( final_graph_hour + pynutil.delete(":") - + (pynini.cross("00", " minutes: \"0\"") | (insert_space + final_graph_minute)) + + (pynini.cross("00", ' minutes: "0"') | (insert_space + final_graph_minute)) + pynutil.delete(":") - + (pynini.cross("00", " seconds: \"0\"") | (insert_space + final_graph_second)) + + (pynini.cross("00", ' seconds: "0"') | (insert_space + final_graph_second)) + final_suffix + final_time_zone_optional + pynutil.insert(" preserve_order: true") diff --git a/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py index bfcc295b6..e6590536f 100644 --- a/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py @@ -70,7 +70,7 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_de_tn_{deterministic}_deterministic{whitelist_file}.far" + cache_dir, f"_{input_case}_de_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -92,7 +92,7 @@ def __init__( self.fraction = FractionFst(cardinal=self.cardinal, deterministic=deterministic) fraction_graph = self.fraction.fst self.measure = MeasureFst( - cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic + cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic, ) 
measure_graph = self.measure.fst self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic) @@ -104,7 +104,7 @@ def __init__( telephone_graph = self.telephone.fst self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic) + self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) money_graph = self.money.fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) whitelist_graph = self.whitelist.fst @@ -121,7 +121,7 @@ def __init__( | pynutil.add_weight(decimal_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) | pynutil.add_weight(telephone_graph, 1.1) - | pynutil.add_weight(electronic_graph, 1.1) + | pynutil.add_weight(electronic_graph, 1.11) ) classify |= pynutil.add_weight(word_graph, 100) @@ -132,7 +132,7 @@ def __init__( pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) ) - graph = token_plus_punct + pynini.closure(pynutil.add_weight(delete_extra_space, 1.1) + token_plus_punct) + graph = token_plus_punct + pynini.closure((delete_extra_space).ques + token_plus_punct) graph = delete_space + graph + delete_space self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/de/verbalizers/electronic.py b/nemo_text_processing/text_normalization/de/verbalizers/electronic.py index b3c2a378f..7e825f978 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/electronic.py @@ -19,6 +19,7 @@ from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, NEMO_SIGMA, + NEMO_SPACE, GraphFst, delete_preserve_order, insert_space, @@ -55,21 +56,21 @@ def add_space_after_char(): verbalize_characters = pynini.cdrewrite(graph_symbols | graph_digit, "", "", NEMO_SIGMA) 
- user_name = pynutil.delete("username: \"") + add_space_after_char() + pynutil.delete("\"") + user_name = pynutil.delete('username: "') + add_space_after_char() + pynutil.delete('"') user_name @= verbalize_characters convert_defaults = pynutil.add_weight(NEMO_NOT_QUOTE, weight=0.0001) | domain_common | server_common domain = convert_defaults + pynini.closure(insert_space + convert_defaults) domain @= verbalize_characters - domain = pynutil.delete("domain: \"") + domain + pynutil.delete("\"") + domain = pynutil.delete('domain: "') + domain + pynutil.delete('"') protocol = ( - pynutil.delete("protocol: \"") + pynutil.delete('protocol: "') + add_space_after_char() @ pynini.cdrewrite(graph_symbols, "", "", NEMO_SIGMA) - + pynutil.delete("\"") + + pynutil.delete('"') ) - self.graph = (pynini.closure(protocol + pynini.accep(" "), 0, 1) + domain) | ( - user_name + pynini.accep(" ") + pynutil.insert("at ") + domain + self.graph = (pynini.closure(protocol + NEMO_SPACE, 0, 1) + domain) | ( + user_name + NEMO_SPACE + pynutil.insert("at ") + domain | (pynutil.insert("at ") + user_name) ) delete_tokens = self.delete_tokens(self.graph + delete_preserve_order) self.fst = delete_tokens.optimize() diff --git a/tests/nemo_text_processing/de/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/de/data_text_normalization/test_cases_date.txt index 7de5cb7d7..d1b58fe80 100644 --- a/tests/nemo_text_processing/de/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/de/data_text_normalization/test_cases_date.txt @@ -3,7 +3,6 @@ vierzehnter januar~14. januar erster januar~1. januar dreißigster juni~30. 
juni zweiter märz zwei tausend drei~02.03.2003 -zweiter märz zwei tausend drei~02.03.2003 zweiter märz zwei tausend drei~2.3.2003 zweiter märz~2.3 zweiter märz~02.03 diff --git a/tests/nemo_text_processing/de/data_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/de/data_text_normalization/test_cases_decimal.txt index 551114d8e..e8bde3941 100644 --- a/tests/nemo_text_processing/de/data_text_normalization/test_cases_decimal.txt +++ b/tests/nemo_text_processing/de/data_text_normalization/test_cases_decimal.txt @@ -4,4 +4,7 @@ vier hundert sechzig millionen~460 millionen ein hundert zwanzig millionen~120 millionen zehn millionen~10 millionen minus sechzig komma zwei vier null null~-60,2400 -acht hundert achtzehn komma drei null drei~818,303 \ No newline at end of file +acht hundert achtzehn komma drei null drei~818,303 +eins , zwei komma drei~1,2,3 +eins komma zwei , drei komma vier~1,2,3,4 +eins , zwei komma drei , vier komma fünf~1,2,3,4,5 \ No newline at end of file diff --git a/tests/nemo_text_processing/de/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/de/data_text_normalization/test_cases_electronic.txt index fc9b0fc66..02e32b9ac 100644 --- a/tests/nemo_text_processing/de/data_text_normalization/test_cases_electronic.txt +++ b/tests/nemo_text_processing/de/data_text_normalization/test_cases_electronic.txt @@ -6,4 +6,10 @@ a eins b zwei at a b c punkt com~a1b2@abc.com a b drei bindestrich s d d bindestrich drei at g mail punkt com~ab3-sdd-3@gmail.com h t t p s doppelpunkt slash slash w w w punkt a b c punkt com~https://www.abc.com w w w punkt a b c punkt com~www.abc.com -h t t p s doppelpunkt slash slash w w w punkt a b c punkt com slash a b fragezeichen gleichheitszeichen drei bindestrich slash a b s slash eins~https://www.abc.com/ab?=3-/abs/1 \ No newline at end of file +b r e t t s p i e l v e r s a n d punkt de .~brettspielversand.de. +w w w punkt e n v e e d y a punkt net .~www.enveedya.net. 
+w w w punkt a m a z o n punkt com punkt de .~www.amazon.com.de. +h t t p s doppelpunkt slash slash w w w punkt a b c punkt com slash a b fragezeichen gleichheitszeichen drei bindestrich slash a b s slash eins~https://www.abc.com/ab?=3-/abs/1 +at j e n s e n~@jensen +at j e n s e n punkt m e~@jensen.me +at w e z y r eins neun acht sechs~@wezyr1986 \ No newline at end of file diff --git a/tests/nemo_text_processing/de/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/de/data_text_normalization/test_cases_time.txt index 2b5216d20..ad6c7d7bd 100644 --- a/tests/nemo_text_processing/de/data_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/de/data_text_normalization/test_cases_time.txt @@ -23,4 +23,6 @@ vier und zwanzig uhr fünf und vierzig~24:45 Uhr vier und zwanzig uhr fünfzehn~24:15 Uhr null uhr null minuten null sekunden~00:00:00 Uhr ein uhr eine minute eine sekunde e s t~01:01:01 Uhr est -zwei uhr zwei minuten drei und zwanzig sekunden~02:02:23 Uhr \ No newline at end of file +zwei uhr zwei minuten drei und zwanzig sekunden~02:02:23 Uhr +zwei uhr dreißig~2.30 Uhr +zwei uhr dreißig~02.30 Uhr \ No newline at end of file diff --git a/tests/nemo_text_processing/de/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/de/test_sparrowhawk_inverse_text_normalization.sh index 634603c49..fa24eb640 100644 --- a/tests/nemo_text_processing/de/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/de/test_sparrowhawk_inverse_text_normalization.sh @@ -3,7 +3,7 @@ PROJECT_DIR=/workspace/tests GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} -PROJECT_DIR=${2:-"/workspace/tests/en"} +PROJECT_DIR=${2:-"/workspace/tests/"} runtest () { input=$1 diff --git a/tests/nemo_text_processing/de/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/de/test_sparrowhawk_normalization.sh index 4f38d8c47..ac40cbae7 100644 --- 
a/tests/nemo_text_processing/de/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/de/test_sparrowhawk_normalization.sh @@ -1,7 +1,7 @@ #! /bin/sh GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} -PROJECT_DIR=${2:-"/workspace/tests/en"} +PROJECT_DIR=${2:-"/workspace/tests/"} runtest () { input=$1 From df22a189afbcd1544c147d1ab060176918e303ce Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Fri, 7 Jun 2024 12:32:30 -0400 Subject: [PATCH 24/90] Tts en tech terms (#167) * update tts whitelist Signed-off-by: Mariana Graterol Fuenmayor * enable normalization of emphasized input Signed-off-by: Mariana Graterol Fuenmayor * add whitelist terms Signed-off-by: Mariana Graterol Fuenmayor * add test for emphasis Signed-off-by: Mariana Graterol Fuenmayor * read card numbers as digits Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * make ccs deterministic Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update jenkins Signed-off-by: Mariana Graterol Fuenmayor * fix sh tests bug Signed-off-by: Mariana Graterol Fuenmayor * fix bug with time Signed-off-by: Mariana Graterol Fuenmayor * update jenkins Signed-off-by: Mariana Graterol Fuenmayor * fix sh time bug Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- .../en/data/electronic/cc_cues.tsv | 5 + .../en/data/whitelist/tts.tsv | 93 +++++++++++++------ .../en/taggers/electronic.py | 14 +++ .../en/taggers/punctuation.py | 2 +- .../en/verbalizers/post_processing.py | 2 +- .../text_normalization/en/verbalizers/time.py | 8 +- 
.../test_cases_electronic.txt | 1 + .../test_cases_serial.txt | 2 +- .../test_cases_time.txt | 2 + .../test_cases_whitelist.txt | 1 + .../test_cases_word.txt | 1 + tools/text_processing_deployment/sh_test.sh | 4 +- 13 files changed, 100 insertions(+), 37 deletions(-) create mode 100644 nemo_text_processing/text_normalization/en/data/electronic/cc_cues.tsv diff --git a/Jenkinsfile b/Jenkinsfile index 6f3c3780a..41bf70fa4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -13,7 +13,7 @@ pipeline { AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0' DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0' - EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-18-23-0' + EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-06-24-0' ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0' ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2' FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/12-05-23-0' diff --git a/nemo_text_processing/text_normalization/en/data/electronic/cc_cues.tsv b/nemo_text_processing/text_normalization/en/data/electronic/cc_cues.tsv new file mode 100644 index 000000000..c42d25f13 --- /dev/null +++ b/nemo_text_processing/text_normalization/en/data/electronic/cc_cues.tsv @@ -0,0 +1,5 @@ +card ending in +credit card number +credit card +debit card number +debit card \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv b/nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv index e81924755..758691676 100644 --- a/nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv +++ b/nemo_text_processing/text_normalization/en/data/whitelist/tts.tsv @@ -98,7 +98,8 @@ b. c. BC b.c. BC A. A. a a A.A. AA -A&A A and A +A&A AANDA +A100 AONEHUNDRED AAAs AAA's AaB AAB Aabo AABO @@ -117,7 +118,7 @@ Abp ABP Abr ABR abv ABV Abz ABZ -A&C A and C +A&C AANDC A. C. AC A.C. 
AC A/C AC @@ -131,12 +132,12 @@ ACMs ACM's AcpA ACPA AcpB ACPB acq ACQ -A&CR A and CR +A&CR AANDCR AcSOC ACSOC acu ACU AC&W AC and W Acy ACY -A&D A and D +A&D AANDD Adab ADAB A. D. AD A.D. AD @@ -157,7 +158,7 @@ ADRs ADR's ADSRs ADSR's ADSs ADS's Adwa ADWA -A&E A and E +A&E AANDE A. E. AE A.E. AE Aed AED @@ -220,7 +221,7 @@ Ahta AHTA aht AHT Ahu AHU Ahva AHVA -A&I A and I +A&I AANDI A. I. AI A.I. AI Aias AIA's @@ -251,7 +252,7 @@ aka. AKA aka AKA Aka. AKA Aka AKA -A&K A and K +A&K AANDK A. K. AK A.K. AK AKAPs AKAP's @@ -260,13 +261,13 @@ AKAs AKA's akh AKH Akpa AKPA Aku AKU -A&L A and L +A&L AANDL A. L. AL A.L. AL Alh ALH ALUs ALU's A'ma AMA -A&M A and M +A&M AANDM a.m. AM a.m AM A. M. AM @@ -287,7 +288,7 @@ Amm AMM AM&O AM and O Ampt AMPT amr AMR -A&M's A and M's +A&M's AANDM's Ams' AM's amsl AMSL Amta AMTA @@ -305,7 +306,7 @@ Angu ANGU Anhe ANHE Anr ANR Anrep ANREP -A&O A and O +A&O AANDO A. O. AO A.O. AO AOCCs AOCC's @@ -326,7 +327,7 @@ AoS AOS Aotus AOTU's aov AOV aovf AOVF -A&P A and P +A&P AANDP A. P. AP A.P. AP Apc APC @@ -347,14 +348,14 @@ appr APPR Appts APPT's appu APPU Appu APPU -A&P's A and P's +A&P's AANDP's Aps AP's APs AP's AP&T AP and T Aqa AQA Aql AQL Aqr AQR -A&R A and R +A&R AANDR A. R. AR A.R. AR ArgR ARGR @@ -389,7 +390,7 @@ Asst ASST Aste ASTE Astt ASTT Aswa ASWA -A&T A and T +A&T AANDT atac ATAC Atac ATAC Atad ATAD @@ -422,12 +423,12 @@ ATOs ATO's atpB ATPB atri ATRI Atri ATRI -A&T's A and T's +A&T's AANDT's A.T.s AT's AT&SF AT and SF -AT&T AT and T +AT&T ATANDT attd ATTD -AT&T's AT and T's +AT&T's ATANDTS atv ATV ATVs ATV's AtxA ATXA @@ -643,7 +644,7 @@ Bzyb BZYB Caat CAAT C. A. CA C.A. CA -C&A C and A +C&A CANDA CA&CC CA and CC Cadw CADW caeca CAECA @@ -893,7 +894,7 @@ czy CZY Daai DAAI D. A. DA D.A. DA -D&AD D and AD +D&AD DANDAD D.A.N.C.E. dance Dav DAV dBa DBA @@ -1511,7 +1512,7 @@ Geu GEU Gev GEV GeV GEV Gfa GFA -GF&A GF and A +GF&A GFANDA G. F. GF G.F. GF G. G. GG @@ -2052,7 +2053,7 @@ JHSVs JHSV's JHud JHUD J. I. JI J.I. 
JI -J&J J and J +J&J JANDJ J. J. JJ J.J. JJ Jka JKA @@ -2107,6 +2108,7 @@ Jym JYM Jymn JYMN J. Z. JZ J.Z. JZ +K8S KUBERNETES K. A. KA K.A. KA kbi KBI @@ -2323,6 +2325,7 @@ L.Y. LY L&YR L and YR M. A. MA M.A. MA +MAX-Q MAXQ M&A M and A Mbewu MBEWU mbi MBI @@ -2396,7 +2399,7 @@ M&L M and L M. L. ML M.L. ML MLPs MLP's -MM&A MM and A +MM&A MMANDA M&M M and M M. M. MM M.M. MM @@ -2553,6 +2556,7 @@ N.W. NW nyc NYC N. Y. NY N.Y. NY +NX-SOC NXSOC N'Zif NZIF NZiK NZIK N'Zi NZI @@ -2785,9 +2789,9 @@ Pyw PYW Pyx PYX P. Z. PZ Q. A. QA -Q&A Q and A -Q&A's Q and A's -Q&As Q and A's +Q&A QANDA +Q&A's QANDA's +Q&As QANDAS QbA QBA Q.B. QB Q.C. QC @@ -2923,6 +2927,7 @@ rup RUP Rup RUP R. U. RU R.U. RU +RUN:AI RUN AI RutB RUTB Ruu RUU Ruwa RUWA @@ -2995,7 +3000,7 @@ S. F. SF S.F. SF sfs SFS S.F.'s SF's -SG&A SG and A +SG&A SGANDA sgb SGB Sgip SGIP sgml SGML @@ -3093,6 +3098,7 @@ Ssy SSY ST&AJ ST and AJ STDs STD's stfv STFV +STG-TWO STGTWO STGs STG's STIs STI's STi STI @@ -3849,3 +3855,36 @@ Z. Y. ZY Z.Y. ZY Z. Z. ZZ Z.Z. 
ZZ +2.5G two point five g +7-eleven seven eleven +AAA triple a +C# c sharp +DeepMind deep mind +DeepStream deep stream +DevOps dev ops +DisplayPort display port +HuggingFace hugging face +ServiceNow service now +SuperCloud super cloud +Wi-Fi wifi +7-Eleven seven eleven +Deepmind deep mind +Deepstream deep stream +Devops dev ops +Displayport display port +Huggingface hugging face +Servicenow service now +Supercloud super cloud +wi-fi wifi +deepmind deep mind +deepstream deep stream +devops dev ops +displayport display port +huggingface hugging face +servicenow service now +supercloud super cloud +wi-fi wifi +3-D three d +401k four oh one k +401(k) four oh one k +401 (k) four oh one k \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/en/taggers/electronic.py b/nemo_text_processing/text_normalization/en/taggers/electronic.py index 22975ab95..4d6f0e6ce 100644 --- a/nemo_text_processing/text_normalization/en/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/en/taggers/electronic.py @@ -49,6 +49,8 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): else: numbers = pynutil.insert(" ") + cardinal.long_numbers + pynutil.insert(" ") + cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT) + accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input") accepted_common_domains = pynini.project( pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input" @@ -118,6 +120,18 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): # www.abc.com/sdafsdf, or https://www.abc.com/asdfad or www.abc.abc/asdfad graph |= protocol + pynutil.insert(" ") + domain_graph_with_class_tags + if deterministic: + # credit card cues + numbers = pynini.closure(NEMO_DIGIT, 4, 16) + cc_phrases = ( + pynutil.insert("protocol: \"") + + cc_cues + + pynutil.insert("\" domain: \"") + + numbers + + 
pynutil.insert("\"") + ) + graph |= cc_phrases + final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/en/taggers/punctuation.py b/nemo_text_processing/text_normalization/en/taggers/punctuation.py index 56d2cdcb2..eb7d7e046 100644 --- a/nemo_text_processing/text_normalization/en/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/en/taggers/punctuation.py @@ -43,7 +43,7 @@ def __init__(self, deterministic: bool = True): chr(i) for i in range(sys.maxunicode) if category(chr(i)).startswith("P") and chr(i) not in punct_symbols_to_exclude - ] + ] + ["\[", "\]"] whitelist_symbols = load_labels(get_abs_path("data/whitelist/symbol.tsv")) whitelist_symbols = [x[0] for x in whitelist_symbols] diff --git a/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py index 85d0693cf..2269b9e17 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/post_processing.py @@ -109,7 +109,7 @@ def get_punct_postprocess_graph(self): # no_space_before_punct assume no space before them quotes = ["'", "\"", "``", "«"] dashes = ["-", "—"] - brackets = ["<", "{", "("] + brackets = ["<", "{", "(", "\["] open_close_single_quotes = [ ("`", "`"), ] diff --git a/nemo_text_processing/text_normalization/en/verbalizers/time.py b/nemo_text_processing/text_normalization/en/verbalizers/time.py index 34503eb1d..ff37fd213 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/time.py @@ -86,12 +86,12 @@ def __init__(self, deterministic: bool = True): + optional_suffix + optional_zone ) + graph_hms @= pynini.cdrewrite(pynini.cross("one hours", "one hour"), "[BOS]", "", NEMO_SIGMA) graph_hms @= pynini.cdrewrite( pynutil.delete("o ") - | pynini.cross("one 
minutes", "one minute") - | pynini.cross("one seconds", "one second") - | pynini.cross("one hours", "one hour"), - pynini.union(" ", "[BOS]"), + | pynutil.add_weight(pynini.cross("o one minutes", "one minute"), -0.01) + | pynutil.add_weight(pynini.cross("o one seconds", "one second"), -0.01), + " ", "", NEMO_SIGMA, ) diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt index 0e930452a..9892a9fe5 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt @@ -38,3 +38,4 @@ rtxprohelp@exchange.nvidia.com~RTX pro help at exchange dot NVIDIA dot com enterpriseservices@nvidia.com~enterprise services at NVIDIA dot com enterprise-services@nvidia.com~enterprise dash services at NVIDIA dot com https://www.nvidia.com/dgx-basepod/~HTTPS colon slash slash WWW dot NVIDIA dot com slash DGX dash BASEPOD slash +i can use your card ending in 8876~i can use your card ending in eight eight seven six \ No newline at end of file diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_serial.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_serial.txt index 04a03e07e..f0a6e0a3f 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_serial.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_serial.txt @@ -10,7 +10,7 @@ MIG-25/235212-asdg~MIG-twenty five/two three five two one two-asdg 1/f-4s~one/f-four s 1/f~one per F 4s~four S -7-eleven~seven-eleven +7-eleven~seven eleven 2x~two x 31/31/100~thirty one/thirty one/one hundred 1-8090~one - eight thousand and ninety diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_time.txt index e7f1d53e4..36e05d287 100644 --- 
a/tests/nemo_text_processing/en/data_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_time.txt @@ -17,3 +17,5 @@ 14:10:30~fourteen hours ten minutes and thirty seconds 2pm-5pm~two PM to five PM 5pm~five PM +21:51:31~twenty one hours fifty one minutes and thirty one seconds +01:01:01~one hour one minute and one second \ No newline at end of file diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_whitelist.txt index 7fafb9d77..8ba5bfd9b 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_whitelist.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_whitelist.txt @@ -4,3 +4,4 @@ DNA is~DNA is C. S. Lewis~CS Lewis tv~TV and/or~and/or +Our company offers a 401(k) retirement savings plan~Our company offers a four oh one k retirement savings plan \ No newline at end of file diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_word.txt index c1bd3b58a..b83ad7c31 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_word.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_word.txt @@ -36,3 +36,4 @@ mar~mar /$€₩£BB¥#%AA and $€₩£¥#%~slash dollar euro won pound BB yen hash percent AA and dollar euro won pound yen hash percent love him while we may,~love him while we may, mar~mar +i saw (22) ducklings~i saw (twenty two) ducklings \ No newline at end of file diff --git a/tools/text_processing_deployment/sh_test.sh b/tools/text_processing_deployment/sh_test.sh index 2f5dd9e81..ec19d903e 100644 --- a/tools/text_processing_deployment/sh_test.sh +++ b/tools/text_processing_deployment/sh_test.sh @@ -57,8 +57,8 @@ bash export_grammars.sh --MODE="export" --GRAMMARS=$GRAMMARS --LANGUAGE=$LANGUAG --FAR_PATH=$FAR_PATH 
--CACHE_DIR=$CACHE_DIR --OVERWRITE_CACHE=$OVERWRITE_CACHE \ --FORCE_REBUILD=$FORCE_REBUILD $WHITELIST -CLASSIFY_FAR=${CACHE_DIR}"/classify/tokenize_and_classify.far" -VERBALIZE_FAR=${CACHE_DIR}"/verbalize/verbalize.far" +CLASSIFY_FAR=${CACHE_DIR}_${GRAMMARS}_${INPUT_CASE}/classify/tokenize_and_classify.far +VERBALIZE_FAR=${CACHE_DIR}_${GRAMMARS}_${INPUT_CASE}/verbalize/verbalize.far CONFIG=${LANGUAGE}_${GRAMMARS}_${INPUT_CASE} From 1318648d83a877ef6e5fa57a0b0434b5b334f799 Mon Sep 17 00:00:00 2001 From: Simon Zuberek Date: Fri, 7 Jun 2024 12:32:46 -0400 Subject: [PATCH 25/90] Normalizes the '%' sign (#180) Signed-off-by: Simon Zuberek Co-authored-by: Simon Zuberek Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- .../text_normalization/it/data/measure/measurements.tsv | 3 ++- .../it/data_text_normalization/test_cases_measure.txt | 3 ++- .../nemo_text_processing/it/test_sparrowhawk_normalization.sh | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 41bf70fa4..ba2ddfdab 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -23,7 +23,7 @@ pipeline { VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-30-24-0' - IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-26-23-0' + IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' diff --git a/nemo_text_processing/text_normalization/it/data/measure/measurements.tsv b/nemo_text_processing/text_normalization/it/data/measure/measurements.tsv index 68b808dcd..bb359e6a0 100644 --- a/nemo_text_processing/text_normalization/it/data/measure/measurements.tsv +++ 
b/nemo_text_processing/text_normalization/it/data/measure/measurements.tsv @@ -60,4 +60,5 @@ l litro dl decilitro bar bar kcal chilocaloria -cal caloria \ No newline at end of file +cal caloria +% percento \ No newline at end of file diff --git a/tests/nemo_text_processing/it/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/it/data_text_normalization/test_cases_measure.txt index 6595ccd37..aa1e945d0 100644 --- a/tests/nemo_text_processing/it/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/it/data_text_normalization/test_cases_measure.txt @@ -3,4 +3,5 @@ 4,3 l~quattro virgola tre litri 5 km/s~cinque chilometri per secondo 15 A~quindici ampere -155 d~cento cinquantacinque giorni \ No newline at end of file +155 d~cento cinquantacinque giorni +il 18% delle emissioni di carbonio~il diciotto percento delle emissioni di carbonio \ No newline at end of file diff --git a/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh index e43d90353..a49aa1d1d 100644 --- a/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/it/test_sparrowhawk_normalization.sh @@ -1,7 +1,7 @@ #! 
/bin/sh PGRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} -PROJECT_DIR=${2:-"/workspace/tests/en"} +PROJECT_DIR=${2:-"/workspace/tests/"} runtest () { input=$1 From 7783a1c6c79ce3e7150266a0bab4094ac01a70f4 Mon Sep 17 00:00:00 2001 From: Simon Zuberek Date: Fri, 7 Jun 2024 12:33:30 -0400 Subject: [PATCH 26/90] FR TN Fixes (#181) * Normalizes the '%' sign Signed-off-by: Simon Zuberek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updates pathing in Jenkins Signed-off-by: Simon Zuberek * Fixes test cases Signed-off-by: Simon Zuberek * More test case fixes Signed-off-by: Simon Zuberek --------- Signed-off-by: Simon Zuberek Co-authored-by: Simon Zuberek Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- .../text_normalization/fr/data/whitelist.tsv | 3 ++- .../fr/taggers/tokenize_and_classify.py | 13 +++---------- .../test_cases_whitelist.txt | 3 ++- 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index ba2ddfdab..11d607ffe 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -16,7 +16,7 @@ pipeline { EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-06-24-0' ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0' ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2' - FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/12-05-23-0' + FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0' HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' diff --git a/nemo_text_processing/text_normalization/fr/data/whitelist.tsv b/nemo_text_processing/text_normalization/fr/data/whitelist.tsv index f33457ba8..dc563bdab 100644 --- 
a/nemo_text_processing/text_normalization/fr/data/whitelist.tsv +++ b/nemo_text_processing/text_normalization/fr/data/whitelist.tsv @@ -9,4 +9,5 @@ Dʳᵉˢ docteures apr. J.-C. après jésus-christ av. J.-C. avant Jésus-Christ le hon. l’honorable -le très hon. le très hononrable \ No newline at end of file +le très hon. le très hononrable +% pour cent \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py index 2c2518385..de9a0b047 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py @@ -62,7 +62,7 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_fr_tn_{deterministic}_deterministic{whitelist_file}.far" + cache_dir, f"_{input_case}_fr_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -79,7 +79,7 @@ def __init__( self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic) decimal_graph = self.decimal.fst - self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic) + self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,) fraction_graph = self.fraction.fst word_graph = WordFst(deterministic=deterministic).fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) @@ -105,14 +105,7 @@ def __init__( pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) ) - graph = token_plus_punct + pynini.closure( - ( - pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), 
delete_extra_space) - | (pynutil.insert(" ") + punct + pynutil.insert(" ")) - ) - + token_plus_punct - ) - + graph = token_plus_punct + pynini.closure((delete_extra_space).ques + token_plus_punct) graph = delete_space + graph + delete_space graph |= punct diff --git a/tests/nemo_text_processing/fr/data_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_whitelist.txt index 50997ed9b..6a6b179dc 100644 --- a/tests/nemo_text_processing/fr/data_text_normalization/test_cases_whitelist.txt +++ b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_whitelist.txt @@ -3,4 +3,5 @@ Dʳᵉˢ~docteures Mᵐᵉ~madame Mᵐᵉˢ~mesdames Mˡˡᵉ~mademoiselle -Mˡˡᵉˢ~mademoiselles \ No newline at end of file +Mˡˡᵉˢ~mademoiselles +18%~dix-huit pour cent \ No newline at end of file From 85a771b92707cf3af48eea1ecddf97c71c386948 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Fri, 12 Jul 2024 12:44:46 -0700 Subject: [PATCH 27/90] testing Signed-off-by: Alex Cui --- test.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test.txt diff --git a/test.txt b/test.txt new file mode 100644 index 000000000..e69de29bb From 5bb48724889d795b030956f0f9d55907f4288a6d Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Fri, 12 Jul 2024 12:45:34 -0700 Subject: [PATCH 28/90] removing test.txt Signed-off-by: Alex Cui --- test.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test.txt diff --git a/test.txt b/test.txt deleted file mode 100644 index e69de29bb..000000000 From 72341b15cf386036377557d53bbfff6706574eeb Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Mon, 15 Jul 2024 12:47:41 -0700 Subject: [PATCH 29/90] fixing zh tn money curreny on l Signed-off-by: Alex Cui --- .../text_normalization/zh/taggers/money.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py 
b/nemo_text_processing/text_normalization/zh/taggers/money.py index 786319627..f5f40fb3e 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -19,7 +19,6 @@ from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path -# def get_quantity(decimal): suffix = pynini.union( "万", "十万", @@ -107,7 +106,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa # larger money as decimals graph_decimal = ( pynutil.insert('integer_part: \"') - + pynini.closure( + + ( pynini.closure(cardinal, 1) + pynutil.delete('.') + pynutil.insert('点') @@ -117,24 +116,23 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa ) graph_decimal_money = ( pynini.closure(graph_decimal, 1) - + pynini.closure(pynutil.insert(' quantity: \"') + suffix + pynutil.insert('\"')) + + pynini.closure((pynutil.insert(' quantity: \"') + suffix + pynutil.insert('\"')), 0, 1) + pynutil.insert(" ") + pynini.closure(currency_mandarin_component, 1) ) | ( pynini.closure(currency_component, 1) - + pynutil.insert(" ") + pynini.closure(graph_decimal, 1) - + pynini.closure(pynutil.insert(" ") + pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')) + + pynini.closure( + (pynutil.insert(" ") + pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')), 0, 1 + ) ) - graph = ( + final_graph = ( graph_regular_money | graph_units | pynutil.add_weight(graph_mandarin_money, -3.0) | pynutil.add_weight(graph_decimal_money, -1.0) ) - final_graph = graph - final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() From 14ff392d02284fe12f2348129b27e481569bc2d6 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Mon, 15 Jul 2024 15:09:49 -0700 Subject: [PATCH 30/90] bug fix on money currency l Signed-off-by: Alex Cui --- 
tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh index dd352b42b..5089427b6 100644 --- a/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh @@ -1,7 +1,7 @@ #! /bin/sh GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} -PROJECT_DIR=${2:-"/workspace/tests/en"} +PROJECT_DIR=${2:-"/workspace/tests"} runtest () { input=$1 From 694c33b8c89aed21d1ec88c144e6f546092221f0 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Mon, 15 Jul 2024 16:12:55 -0700 Subject: [PATCH 31/90] updates for zh tn Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 11d607ffe..7642e0912 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-30-24-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-15-24-0' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' From 3152b3559176467147e86b8efc39cbae1b9fade5 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Tue, 16 Jul 2024 09:23:10 -0700 Subject: [PATCH 32/90] resolving failed ci tests for money grammar Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/zh/taggers/money.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index f5f40fb3e..607a968dc 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -121,6 +121,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa + pynini.closure(currency_mandarin_component, 1) ) | ( pynini.closure(currency_component, 1) + + pynutil.insert(" ") + pynini.closure(graph_decimal, 1) + pynini.closure( (pynutil.insert(" ") + pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')), 0, 1 @@ -134,5 +135,5 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa | pynutil.add_weight(graph_decimal_money, -1.0) ) - final_graph = self.add_tokens(final_graph) + final_graph = self.add_tokens(graph_decimal_money) self.fst = final_graph.optimize() From d619eca7c7aef7b3ba3f6c421ddb102f2f5dc04a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 16:23:50 +0000 Subject: [PATCH 33/90] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo_text_processing/text_normalization/zh/taggers/money.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 607a968dc..b05097138 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -121,7 +121,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa + pynini.closure(currency_mandarin_component, 1) ) | ( pynini.closure(currency_component, 1) - + pynutil.insert(" ") + + pynutil.insert(" ") + pynini.closure(graph_decimal, 1) + pynini.closure( (pynutil.insert(" ") + 
pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')), 0, 1 From f7416a163ebcf9612beec53e00976689e8d375c6 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Tue, 16 Jul 2024 17:02:45 -0700 Subject: [PATCH 34/90] updates for decimal maoney failure Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/zh/taggers/money.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 607a968dc..c48013d2c 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -135,5 +135,6 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa | pynutil.add_weight(graph_decimal_money, -1.0) ) - final_graph = self.add_tokens(graph_decimal_money) + #import pdb; pdb.set_trace() + final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() From b5e6f33396c0e7b959a5bab91a967a1803e7fae0 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Tue, 16 Jul 2024 17:03:05 -0700 Subject: [PATCH 35/90] removing comments Signed-off-by: Alex Cui --- .../zh/taggers/tokenize_and_classify.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py index d35ea178b..c6cd716e0 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py @@ -78,18 +78,18 @@ def __init__( punctuation = PunctuationFst(deterministic=deterministic) classify = pynini.union( - pynutil.add_weight(date.fst, 1.1), - pynutil.add_weight(fraction.fst, 1.0), - pynutil.add_weight(money.fst, 1.1), - pynutil.add_weight(measure.fst, 1.05), - pynutil.add_weight(time.fst, 1.1), - 
pynutil.add_weight(whitelist.fst, 1.1), - pynutil.add_weight(cardinal.fst, 1.1), - pynutil.add_weight(decimal.fst, 3.05), - pynutil.add_weight(ordinal.fst, 1.1), - pynutil.add_weight(punctuation.fst, 1.0), - pynutil.add_weight(word.fst, 100), - ) + pynutil.add_weight(date.fst, 1.1), + pynutil.add_weight(fraction.fst, 1.0), + pynutil.add_weight(money.fst, 1.1), + pynutil.add_weight(measure.fst, 1.05), + pynutil.add_weight(time.fst, 1.1), + pynutil.add_weight(whitelist.fst, 1.1), + pynutil.add_weight(cardinal.fst, 1.1), + pynutil.add_weight(decimal.fst, 3.05), + pynutil.add_weight(ordinal.fst, 1.1), + pynutil.add_weight(punctuation.fst, 1.0), + pynutil.add_weight(word.fst, 100), + ) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") tagger = pynini.closure(token, 1) From 682988c14096bb15dca7306961ebe1413008f366 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 17 Jul 2024 00:05:30 +0000 Subject: [PATCH 36/90] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/zh/taggers/money.py | 2 +- .../zh/taggers/tokenize_and_classify.py | 24 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 60fca3f7d..ea6114efc 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -135,6 +135,6 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa | pynutil.add_weight(graph_decimal_money, -1.0) ) - #import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py 
b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py index c6cd716e0..d35ea178b 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py @@ -78,18 +78,18 @@ def __init__( punctuation = PunctuationFst(deterministic=deterministic) classify = pynini.union( - pynutil.add_weight(date.fst, 1.1), - pynutil.add_weight(fraction.fst, 1.0), - pynutil.add_weight(money.fst, 1.1), - pynutil.add_weight(measure.fst, 1.05), - pynutil.add_weight(time.fst, 1.1), - pynutil.add_weight(whitelist.fst, 1.1), - pynutil.add_weight(cardinal.fst, 1.1), - pynutil.add_weight(decimal.fst, 3.05), - pynutil.add_weight(ordinal.fst, 1.1), - pynutil.add_weight(punctuation.fst, 1.0), - pynutil.add_weight(word.fst, 100), - ) + pynutil.add_weight(date.fst, 1.1), + pynutil.add_weight(fraction.fst, 1.0), + pynutil.add_weight(money.fst, 1.1), + pynutil.add_weight(measure.fst, 1.05), + pynutil.add_weight(time.fst, 1.1), + pynutil.add_weight(whitelist.fst, 1.1), + pynutil.add_weight(cardinal.fst, 1.1), + pynutil.add_weight(decimal.fst, 3.05), + pynutil.add_weight(ordinal.fst, 1.1), + pynutil.add_weight(punctuation.fst, 1.0), + pynutil.add_weight(word.fst, 100), + ) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") tagger = pynini.closure(token, 1) From 840ae1fad15b51a6139681aa198a1ca7caf0e21c Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Tue, 16 Jul 2024 17:55:09 -0700 Subject: [PATCH 37/90] updates on money grammar for failure cases Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/zh/taggers/money.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 60fca3f7d..9b4778994 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py 
@@ -135,6 +135,5 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa | pynutil.add_weight(graph_decimal_money, -1.0) ) - #import pdb; pdb.set_trace() final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() From e7284e6f48e29ebd528aa030b4b8ef26a16c63e9 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Tue, 16 Jul 2024 17:55:30 -0700 Subject: [PATCH 38/90] adding test cases in the nvbug Signed-off-by: Alex Cui --- .../test_cases_word.txt | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt index 23270bf82..74b965e80 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt @@ -4,4 +4,22 @@ 只有智商超过一定数值的人才能破解~只有智商超过一定数值的人才能破解 这是由人工智能控制的系统~这是由人工智能控制的系统 欧洲旅游目的地多到不知道怎么选~欧洲旅游目的地多到不知道怎么选 -马斯科卖掉豪宅住进折叠屋~马斯科卖掉豪宅住进折叠屋 \ No newline at end of file +马斯科卖掉豪宅住进折叠屋~马斯科卖掉豪宅住进折叠屋 +免除GOOGLE在一桩诽谤官司中的法律责任。~免除GOOGLE在一桩诽谤官司中的法律责任。 +这对CHROME是有利的。~这对CHROME是有利的。 +这可能是PILde使用者。~这可能是PILde使用者。 +CSI侧重科学办案,也就是现场搜正和鉴识。~CSI侧重科学办案,也就是现场搜正和鉴识。 +我以前非常喜欢一个软体,DRAW。~我以前非常喜欢一个软体,DRAW。 +我爱你病毒。~我爱你病毒。 +微软举办了RACETOMARKETCHALLENGE竞赛。~微软举办了RACETOMARKETCHALLENGE竞赛。 +苹果销售量的复苏程度远超PC市场。~苹果销售量的复苏程度远超PC市场。 +第三季还有两款ANDROID手机亮相。~第三季还有两款ANDROID手机亮相。 +反而应试著让所有GOOGLE服务更加社交化。~反而应试著让所有GOOGLE服务更加社交化。 +GOOGLE已提供一项NATIVECLIENT软体。~GOOGLE已提供一项NATIVECLIENT软体。 +这些程式都支援PRE与ITUNES同步化。~这些程式都支援PRE与ITUNES同步化。 +可以推断此次NTT可能也会将同样的策略用在LTE上。~可以推断此次NTT可能也会将同样的策略用在LTE上。 +现今许多小型企业因成本考量被迫采用一般PC作为伺服器。~现今许多小型企业因成本考量被迫采用一般PC作为伺服器。 +部落格宣布GOOGLECHROMES的诞生。~部落格宣布GOOGLECHROMES的诞生。 +由ZIP订购机场接送或观光景点共乘服务。~由ZIP订购机场接送或观光景点共乘服务。 +PAQUE表示短时间应该还不会全面开放。~PAQUE表示短时间应该还不会全面开放。 +CBS是美国一家重要的广播电视网路公司。~CBS是美国一家重要的广播电视网路公司。 \ No newline at end of file From 83ba1d74a5c36aac734cab7fbc5aaf5b326929b7 Mon Sep 17 00:00:00 2001 From: Alex Cui 
Date: Tue, 16 Jul 2024 18:01:37 -0700 Subject: [PATCH 39/90] updates for ci etst Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/zh/taggers/money.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 632d093b8..9b4778994 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -135,9 +135,5 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa | pynutil.add_weight(graph_decimal_money, -1.0) ) -<<<<<<< HEAD -======= - # import pdb; pdb.set_trace() ->>>>>>> 682988c14096bb15dca7306961ebe1413008f366 final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() From 9ebb5ad6b38ff2479239297bf99fbe861f9c91b8 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Tue, 16 Jul 2024 19:26:14 -0700 Subject: [PATCH 40/90] updating date for rerun Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 7642e0912..1cc9bfd61 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-15-24-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' From 50546b244ecc18e3586657dd227e3e3b99f759ea Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Wed, 17 Jul 2024 16:40:06 -0700 Subject: [PATCH 41/90] renaming final graphs 
Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/zh/taggers/money.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 9b4778994..d0c6d2ca5 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -128,12 +128,12 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa ) ) - final_graph = ( + graph = ( graph_regular_money | graph_units | pynutil.add_weight(graph_mandarin_money, -3.0) | pynutil.add_weight(graph_decimal_money, -1.0) ) - final_graph = self.add_tokens(final_graph) + final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() From 63fec6f949833a6090c474add217104a4a1f88f9 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Thu, 18 Jul 2024 10:04:38 -0700 Subject: [PATCH 42/90] conflicts Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9705cdfdc..176435e6d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-30-24-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-17-24-0' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' From f461b406ccd635aa7cd2687dc5d785a667e5247e Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Thu, 18 Jul 2024 10:13:59 -0700 Subject: [PATCH 43/90] updating data Signed-off-by: Alex Cui --- 
Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 176435e6d..3315f6902 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-17-24-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-18-24-0' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' From 16cb041b419c5b90a34cc78b19dc3eb0550b00b0 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Thu, 18 Jul 2024 12:16:39 -0700 Subject: [PATCH 44/90] attempt to resolve jenkins issue Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 3315f6902..a17a074ad 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-18-24-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-18-24-1' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' From 5548a9536c2b7a3ba4b6dc8e631d95988d798c89 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Thu, 18 Jul 2024 12:31:48 -0700 Subject: [PATCH 45/90] ci tests resolving Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index a17a074ad..9705cdfdc 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-18-24-1' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-30-24-0' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' From 739ef305961bbe72e293c571c9fbc4fc904412fd Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Fri, 12 Jul 2024 12:44:46 -0700 Subject: [PATCH 46/90] testing Signed-off-by: Alex Cui --- test.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test.txt diff --git a/test.txt b/test.txt new file mode 100644 index 000000000..e69de29bb From 4431f6a53c75fbf9b608c7e6a9da56889be15f32 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Fri, 12 Jul 2024 12:45:34 -0700 Subject: [PATCH 47/90] removing test.txt Signed-off-by: Alex Cui --- test.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test.txt diff --git a/test.txt b/test.txt deleted file mode 100644 index e69de29bb..000000000 From c1c7ef426b56e12137eddd913b5c67978955c1b7 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Mon, 15 Jul 2024 12:47:41 -0700 Subject: [PATCH 48/90] fixing zh tn money curreny on l Signed-off-by: Alex Cui --- .../text_normalization/zh/taggers/money.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 786319627..f5f40fb3e 100644 --- 
a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -19,7 +19,6 @@ from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path -# def get_quantity(decimal): suffix = pynini.union( "万", "十万", @@ -107,7 +106,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa # larger money as decimals graph_decimal = ( pynutil.insert('integer_part: \"') - + pynini.closure( + + ( pynini.closure(cardinal, 1) + pynutil.delete('.') + pynutil.insert('点') @@ -117,24 +116,23 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa ) graph_decimal_money = ( pynini.closure(graph_decimal, 1) - + pynini.closure(pynutil.insert(' quantity: \"') + suffix + pynutil.insert('\"')) + + pynini.closure((pynutil.insert(' quantity: \"') + suffix + pynutil.insert('\"')), 0, 1) + pynutil.insert(" ") + pynini.closure(currency_mandarin_component, 1) ) | ( pynini.closure(currency_component, 1) - + pynutil.insert(" ") + pynini.closure(graph_decimal, 1) - + pynini.closure(pynutil.insert(" ") + pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')) + + pynini.closure( + (pynutil.insert(" ") + pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')), 0, 1 + ) ) - graph = ( + final_graph = ( graph_regular_money | graph_units | pynutil.add_weight(graph_mandarin_money, -3.0) | pynutil.add_weight(graph_decimal_money, -1.0) ) - final_graph = graph - final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() From f1d1d967ef327cbbef22b91df953f5dae2e248ff Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Mon, 15 Jul 2024 15:09:49 -0700 Subject: [PATCH 49/90] bug fix on money currency l Signed-off-by: Alex Cui --- tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh index dd352b42b..5089427b6 100644 --- a/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh @@ -1,7 +1,7 @@ #! /bin/sh GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} -PROJECT_DIR=${2:-"/workspace/tests/en"} +PROJECT_DIR=${2:-"/workspace/tests"} runtest () { input=$1 From f520f570e88e7d516b1358e343e04c43310bf2bb Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Tue, 16 Jul 2024 09:23:10 -0700 Subject: [PATCH 50/90] resolving failed ci tests for money grammar Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/zh/taggers/money.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index f5f40fb3e..607a968dc 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -121,6 +121,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa + pynini.closure(currency_mandarin_component, 1) ) | ( pynini.closure(currency_component, 1) + + pynutil.insert(" ") + pynini.closure(graph_decimal, 1) + pynini.closure( (pynutil.insert(" ") + pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')), 0, 1 @@ -134,5 +135,5 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa | pynutil.add_weight(graph_decimal_money, -1.0) ) - final_graph = self.add_tokens(final_graph) + final_graph = self.add_tokens(graph_decimal_money) self.fst = final_graph.optimize() From 818dca08366f1970ee4a8783c4d5c1687a0e1d03 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Tue, 16 Jul 2024 17:02:45 -0700 Subject: [PATCH 51/90] updates for decimal maoney failure Signed-off-by: Alex Cui --- 
nemo_text_processing/text_normalization/zh/taggers/money.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 607a968dc..c48013d2c 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -135,5 +135,6 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa | pynutil.add_weight(graph_decimal_money, -1.0) ) - final_graph = self.add_tokens(graph_decimal_money) + #import pdb; pdb.set_trace() + final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() From 078fb7a0dd3cd3a276bd41d7238e2f90879e30dd Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Tue, 16 Jul 2024 17:03:05 -0700 Subject: [PATCH 52/90] removing comments Signed-off-by: Alex Cui --- .../zh/taggers/tokenize_and_classify.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py index d35ea178b..c6cd716e0 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py @@ -78,18 +78,18 @@ def __init__( punctuation = PunctuationFst(deterministic=deterministic) classify = pynini.union( - pynutil.add_weight(date.fst, 1.1), - pynutil.add_weight(fraction.fst, 1.0), - pynutil.add_weight(money.fst, 1.1), - pynutil.add_weight(measure.fst, 1.05), - pynutil.add_weight(time.fst, 1.1), - pynutil.add_weight(whitelist.fst, 1.1), - pynutil.add_weight(cardinal.fst, 1.1), - pynutil.add_weight(decimal.fst, 3.05), - pynutil.add_weight(ordinal.fst, 1.1), - pynutil.add_weight(punctuation.fst, 1.0), - pynutil.add_weight(word.fst, 100), - ) + pynutil.add_weight(date.fst, 1.1), 
+ pynutil.add_weight(fraction.fst, 1.0), + pynutil.add_weight(money.fst, 1.1), + pynutil.add_weight(measure.fst, 1.05), + pynutil.add_weight(time.fst, 1.1), + pynutil.add_weight(whitelist.fst, 1.1), + pynutil.add_weight(cardinal.fst, 1.1), + pynutil.add_weight(decimal.fst, 3.05), + pynutil.add_weight(ordinal.fst, 1.1), + pynutil.add_weight(punctuation.fst, 1.0), + pynutil.add_weight(word.fst, 100), + ) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") tagger = pynini.closure(token, 1) From 5087cd094dad9fb44f6d3cb21c8c09022cb0c092 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 16:23:50 +0000 Subject: [PATCH 53/90] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo_text_processing/text_normalization/zh/taggers/money.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index c48013d2c..60fca3f7d 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -121,7 +121,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa + pynini.closure(currency_mandarin_component, 1) ) | ( pynini.closure(currency_component, 1) - + pynutil.insert(" ") + + pynutil.insert(" ") + pynini.closure(graph_decimal, 1) + pynini.closure( (pynutil.insert(" ") + pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')), 0, 1 From 66f27874c5815ebbf25e86a3633d075fa2640799 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Tue, 16 Jul 2024 17:55:09 -0700 Subject: [PATCH 54/90] updates on money grammar for failure cases Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/zh/taggers/money.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 60fca3f7d..9b4778994 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -135,6 +135,5 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa | pynutil.add_weight(graph_decimal_money, -1.0) ) - #import pdb; pdb.set_trace() final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() From 85c6ed5beea12490b3dd657624afcfc43f431830 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Tue, 16 Jul 2024 17:55:30 -0700 Subject: [PATCH 55/90] adding test cases in the nvbug Signed-off-by: Alex Cui --- .../test_cases_word.txt | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt index 23270bf82..74b965e80 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt @@ -4,4 +4,22 @@ 只有智商超过一定数值的人才能破解~只有智商超过一定数值的人才能破解 这是由人工智能控制的系统~这是由人工智能控制的系统 欧洲旅游目的地多到不知道怎么选~欧洲旅游目的地多到不知道怎么选 -马斯科卖掉豪宅住进折叠屋~马斯科卖掉豪宅住进折叠屋 \ No newline at end of file +马斯科卖掉豪宅住进折叠屋~马斯科卖掉豪宅住进折叠屋 +免除GOOGLE在一桩诽谤官司中的法律责任。~免除GOOGLE在一桩诽谤官司中的法律责任。 +这对CHROME是有利的。~这对CHROME是有利的。 +这可能是PILde使用者。~这可能是PILde使用者。 +CSI侧重科学办案,也就是现场搜正和鉴识。~CSI侧重科学办案,也就是现场搜正和鉴识。 +我以前非常喜欢一个软体,DRAW。~我以前非常喜欢一个软体,DRAW。 +我爱你病毒。~我爱你病毒。 +微软举办了RACETOMARKETCHALLENGE竞赛。~微软举办了RACETOMARKETCHALLENGE竞赛。 +苹果销售量的复苏程度远超PC市场。~苹果销售量的复苏程度远超PC市场。 +第三季还有两款ANDROID手机亮相。~第三季还有两款ANDROID手机亮相。 +反而应试著让所有GOOGLE服务更加社交化。~反而应试著让所有GOOGLE服务更加社交化。 +GOOGLE已提供一项NATIVECLIENT软体。~GOOGLE已提供一项NATIVECLIENT软体。 +这些程式都支援PRE与ITUNES同步化。~这些程式都支援PRE与ITUNES同步化。 +可以推断此次NTT可能也会将同样的策略用在LTE上。~可以推断此次NTT可能也会将同样的策略用在LTE上。 +现今许多小型企业因成本考量被迫采用一般PC作为伺服器。~现今许多小型企业因成本考量被迫采用一般PC作为伺服器。 
+部落格宣布GOOGLECHROMES的诞生。~部落格宣布GOOGLECHROMES的诞生。 +由ZIP订购机场接送或观光景点共乘服务。~由ZIP订购机场接送或观光景点共乘服务。 +PAQUE表示短时间应该还不会全面开放。~PAQUE表示短时间应该还不会全面开放。 +CBS是美国一家重要的广播电视网路公司。~CBS是美国一家重要的广播电视网路公司。 \ No newline at end of file From bea17edf37f4f2b0352899e4625026dbb59fa5f0 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Wed, 17 Jul 2024 16:40:06 -0700 Subject: [PATCH 56/90] renaming final graphs Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/zh/taggers/money.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 9b4778994..d0c6d2ca5 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -128,12 +128,12 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa ) ) - final_graph = ( + graph = ( graph_regular_money | graph_units | pynutil.add_weight(graph_mandarin_money, -3.0) | pynutil.add_weight(graph_decimal_money, -1.0) ) - final_graph = self.add_tokens(final_graph) + final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() From 5e26452aaf6764b27d86dfed227c2e82c0c48989 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Thu, 18 Jul 2024 10:04:38 -0700 Subject: [PATCH 57/90] conflicts Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 419a887ed..469429f2c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-30-24-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-17-24-0' 
IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' From a7c8b6d210c6b5e13250ae749902b7c9074d913e Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Thu, 18 Jul 2024 10:13:59 -0700 Subject: [PATCH 58/90] updating data Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 469429f2c..29d860872 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-17-24-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-18-24-0' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' From 7a64e69a07322b5cc9bc15f6ccb16e156c614d7d Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Thu, 18 Jul 2024 12:16:39 -0700 Subject: [PATCH 59/90] attempt to resolve jenkins issue Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 29d860872..edce37431 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-18-24-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-18-24-1' 
IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' From 483f667886a3109a646fd53e4875a37a7caeb50d Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Thu, 18 Jul 2024 12:31:48 -0700 Subject: [PATCH 60/90] ci tests resolving Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index edce37431..419a887ed 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-18-24-1' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-30-24-0' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' From 47016b66c866d584f051f411fb39e84c1791bd4e Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Wed, 24 Jul 2024 09:07:21 -0700 Subject: [PATCH 61/90] resolving conflict for ci tests update Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 419a887ed..367b6a449 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -476,4 +476,4 @@ pipeline { cleanWs() } } -} +} \ No newline at end of file From af3f8f7ef3eb6c309b2c5339e33b814632dc3a7c Mon Sep 17 00:00:00 2001 From: anand-nv <105917641+anand-nv@users.noreply.github.com> Date: Wed, 22 Nov 2023 04:17:25 +0530 Subject: [PATCH 62/90] Increase weights for serial (en TN) (#128) * Increase weights for serial (en TN) Resolves 
https://github.com/NVIDIA/NeMo-text-processing/issues/126 Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> * Add tests for fix Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> * Update Jenkinsfile cache path Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> * Update Jenkinsfile. Fix cache folder Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> --------- Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Signed-off-by: Alex Cui --- .../en/data_text_normalization/test_cases_money.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt index e2c828f42..e02209585 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt @@ -63,4 +63,4 @@ $1,925.21~one thousand nine hundred and twenty five dollars twenty one cents $1,234.123~one thousand two hundred and thirty four point one two three dollars US $76.3 trillion~US seventy six point three trillion dollars US$76.3 trillion~seventy six point three trillion us dollars -The price for each canned salmon is $5 , each bottle of peanut butter is $3~The price for each canned salmon is five dollars , each bottle of peanut butter is three dollars +The price for each canned salmon is $5 , each bottle of peanut butter is $3~The price for each canned salmon is five dollars , each bottle of peanut butter is three dollars \ No newline at end of file From 9a68cf912fbd6394ada9a6a9e724fff82708e520 Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Thu, 7 Dec 2023 22:17:32 -0500 Subject: [PATCH 63/90] add measures file for FR TN (#131) * add measures file Signed-off-by: Mariana Graterol Fuenmayor * update whitelist data 
Signed-off-by: Mariana Graterol Fuenmayor * add fr tn tests Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Alex Cui --- tests/nemo_text_processing/fr/test_whitelist.py | 2 +- tests/nemo_text_processing/fr/test_word.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/nemo_text_processing/fr/test_whitelist.py b/tests/nemo_text_processing/fr/test_whitelist.py index dac398fba..dd210b9b7 100644 --- a/tests/nemo_text_processing/fr/test_whitelist.py +++ b/tests/nemo_text_processing/fr/test_whitelist.py @@ -39,4 +39,4 @@ def test_denorm(self, test_input, expected): @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) - assert pred == expected + assert pred == expected \ No newline at end of file diff --git a/tests/nemo_text_processing/fr/test_word.py b/tests/nemo_text_processing/fr/test_word.py index 6d48db4fc..bfda32bd7 100644 --- a/tests/nemo_text_processing/fr/test_word.py +++ b/tests/nemo_text_processing/fr/test_word.py @@ -39,4 +39,4 @@ def test_denorm(self, test_input, expected): @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) - assert pred == expected + assert pred == expected \ No newline at end of file From 78aadbe3a9df38a1d915e1ec642c850373723acc Mon Sep 17 00:00:00 2001 From: anand-nv <105917641+anand-nv@users.noreply.github.com> Date: Fri, 19 Jan 2024 23:29:26 +0530 Subject: [PATCH 64/90] Sh jenkins (#127) * Add SH tests to Jenkins Signed-off-by: Anand Joseph * Update cache paths Signed-off-by: Anand Joseph * Update Jenkins tests Signed-off-by: Anand Joseph * Add CI/CD tests for sparrowhawk Signed-off-by: Anand Joseph * docker build only if in test mode Signed-off-by: Anand Joseph * Fix missing variable Signed-off-by: Anand Joseph * Fix comments and remove arguments not required Signed-off-by: Anand Joseph * Fix commands not executing 
Signed-off-by: Anand Joseph * Missing arguments Signed-off-by: Anand Joseph * Missing quotes Signed-off-by: Anand Joseph * Fix incorrect path for tests Signed-off-by: Anand Joseph * Fix paths Signed-off-by: Anand Joseph * Incorrect paths of tests and shunit2 Signed-off-by: Anand Joseph * Fix issues with paths as arguments to shunit Signed-off-by: Anand Joseph * Undo path change Signed-off-by: Anand Joseph * Fix intentional fail test Signed-off-by: Anand Joseph * revert redundant check for cased option Signed-off-by: Anand Joseph * Fix default path in export_grammars.sh Signed-off-by: Anand Joseph * Update cache paths Signed-off-by: Anand Joseph * Add interactive option Signed-off-by: Anand Joseph * Add SH tests for cased EN ITN Signed-off-by: Anand Joseph --------- Signed-off-by: Anand Joseph Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Signed-off-by: Alex Cui --- .../en/test_sparrowhawk_inverse_text_normalization.sh | 2 +- .../en/test_sparrowhawk_inverse_text_normalization_cased.sh | 2 +- tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh | 2 +- tools/text_processing_deployment/docker/launch.sh | 2 +- tools/text_processing_deployment/export_grammars.sh | 1 - tools/text_processing_deployment/sh_test.sh | 2 +- 6 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh index 705f4bdaf..610cd6c20 100644 --- a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization.sh @@ -82,4 +82,4 @@ testITNWord() { shift $# # Load shUnit2 -. /workspace/shunit2/shunit2 +. 
/workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh index 8c701e06a..fe622bbe7 100644 --- a/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh +++ b/tests/nemo_text_processing/en/test_sparrowhawk_inverse_text_normalization_cased.sh @@ -82,4 +82,4 @@ testITNWord() { shift $# # Load shUnit2 -. /workspace/shunit2/shunit2 +. /workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh index 3d5f7ae19..f15f2290d 100644 --- a/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh @@ -119,4 +119,4 @@ testTNMath() { shift $# # Load shUnit2 -. /workspace/shunit2/shunit2 +. 
/workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tools/text_processing_deployment/docker/launch.sh b/tools/text_processing_deployment/docker/launch.sh index 1bb4c78ca..dea998f7b 100644 --- a/tools/text_processing_deployment/docker/launch.sh +++ b/tools/text_processing_deployment/docker/launch.sh @@ -57,4 +57,4 @@ docker run -it -e LANG=C.UTF-8 -e LC_ALL=C.UTF-8 --rm \ $MOUNTS \ -v $SCRIPT_DIR/../../../tests/nemo_text_processing/:/workspace/tests/ \ -w $WORK_DIR \ - sparrowhawk:latest $CMD + sparrowhawk:latest $CMD \ No newline at end of file diff --git a/tools/text_processing_deployment/export_grammars.sh b/tools/text_processing_deployment/export_grammars.sh index 82d4d4179..2e4a0b998 100644 --- a/tools/text_processing_deployment/export_grammars.sh +++ b/tools/text_processing_deployment/export_grammars.sh @@ -107,4 +107,3 @@ else echo "done mode: $MODE" exit 0 fi - diff --git a/tools/text_processing_deployment/sh_test.sh b/tools/text_processing_deployment/sh_test.sh index 32b5f9774..3e31de37c 100644 --- a/tools/text_processing_deployment/sh_test.sh +++ b/tools/text_processing_deployment/sh_test.sh @@ -63,4 +63,4 @@ VERBALIZE_FAR=${CACHE_DIR}_${GRAMMARS}_${INPUT_CASE}/verbalize/verbalize.far CONFIG=${LANGUAGE}_${GRAMMARS}_${INPUT_CASE} cp $CLASSIFY_FAR /workspace/sparrowhawk/documentation/grammars_${CONFIG}/en_toy/classify/ -cp $VERBALIZE_FAR /workspace/sparrowhawk/documentation/grammars_${CONFIG}/en_toy/verbalize/ +cp $VERBALIZE_FAR /workspace/sparrowhawk/documentation/grammars_${CONFIG}/en_toy/verbalize/ \ No newline at end of file From 4f9da16f6f965bb42e8bdf7a450ffde3d61386e8 Mon Sep 17 00:00:00 2001 From: Evelina <10428420+ekmb@users.noreply.github.com> Date: Wed, 14 Feb 2024 14:28:26 -0800 Subject: [PATCH 65/90] update isort - fix precommit (#138) * update isort version Signed-off-by: Evelina * update isort version Signed-off-by: Evelina * fix format Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci * remove unused imports Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui --- .../ja/verbalizers/word.py | 64 +++++++++---------- .../text_normalization/ar/taggers/decimal.py | 2 +- .../en/taggers/electronic.py | 2 +- .../hu/taggers/electronic.py | 2 +- .../it/taggers/electronic.py | 2 +- .../it/verbalizers/electronic.py | 2 +- .../nemo_text_processing/mr/test_cardinal.py | 16 ++--- tests/nemo_text_processing/mr/test_date.py | 16 ++--- 8 files changed, 53 insertions(+), 53 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py index 366282985..d7c2cc874 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/word.py @@ -1,32 +1,32 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import pynini -from pynini.lib import pynutil - -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space - - -class WordFst(GraphFst): - ''' - tokens { char: "一" } -> 一 - ''' - - def __init__(self, deterministic: bool = True, lm: bool = False): - super().__init__(name="char", kind="verbalize", deterministic=deterministic) - - graph = pynutil.delete("name: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"") - graph = pynini.closure(delete_space) + graph + pynini.closure(delete_space) - self.fst = graph.optimize() +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + +class WordFst(GraphFst): + ''' + tokens { char: "一" } -> 一 + ''' + + def __init__(self, deterministic: bool = True, lm: bool = False): + super().__init__(name="char", kind="verbalize", deterministic=deterministic) + + graph = pynutil.delete("name: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"") + graph = pynini.closure(delete_space) + graph + pynini.closure(delete_space) + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/ar/taggers/decimal.py b/nemo_text_processing/text_normalization/ar/taggers/decimal.py index f276155e9..e8325214a 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/ar/taggers/decimal.py @@ -77,4 +77,4 @@ def __init__(self, cardinal: GraphFst, deterministic: bool): ) self.final_graph = self.add_tokens(self.final_graph_decimal) - self.fst = self.final_graph.optimize() + self.fst = self.final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/en/taggers/electronic.py b/nemo_text_processing/text_normalization/en/taggers/electronic.py index c3d0a1003..521f72efa 100644 --- a/nemo_text_processing/text_normalization/en/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/en/taggers/electronic.py @@ -164,4 +164,4 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): final_graph = self.add_tokens(graph) - self.fst = final_graph.optimize() + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hu/taggers/electronic.py b/nemo_text_processing/text_normalization/hu/taggers/electronic.py index 67160e99e..8dad44e15 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/hu/taggers/electronic.py @@ 
-112,4 +112,4 @@ def __init__(self, deterministic: bool = True): self.graph = graph final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true")) - self.fst = final_graph.optimize() + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/it/taggers/electronic.py b/nemo_text_processing/text_normalization/it/taggers/electronic.py index 2fe3e855e..ef2bb756e 100644 --- a/nemo_text_processing/text_normalization/it/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/it/taggers/electronic.py @@ -103,4 +103,4 @@ def __init__(self, deterministic: bool = True): self.graph = graph final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true")) - self.fst = final_graph.optimize() + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/it/verbalizers/electronic.py b/nemo_text_processing/text_normalization/it/verbalizers/electronic.py index f7071d3d2..0f1c87e41 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/electronic.py @@ -85,4 +85,4 @@ def add_space_after_char(): ) delete_tokens = self.delete_tokens(self.graph + delete_preserve_order) - self.fst = delete_tokens.optimize() + self.fst = delete_tokens.optimize() \ No newline at end of file diff --git a/tests/nemo_text_processing/mr/test_cardinal.py b/tests/nemo_text_processing/mr/test_cardinal.py index e7bd452fd..73ca0423e 100644 --- a/tests/nemo_text_processing/mr/test_cardinal.py +++ b/tests/nemo_text_processing/mr/test_cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,17 +15,17 @@ import pytest from parameterized import parameterized -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file -class TestCardinal: - inverse_normalizer_mr = InverseNormalizer(lang='mr', cache_dir=CACHE_DIR, overwrite_cache=False) +class TestPreprocess: + normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') - @parameterized.expand(parse_test_case_file('mr/data_inverse_text_normalization/test_cases_cardinal.txt')) + @parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_preprocess.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer_mr.inverse_normalize(test_input, verbose=False) - assert pred == expected + def test_norm_preprocess(self, test_input, expected): + preds = self.normalizer_zh.normalize(test_input) + assert expected == preds diff --git a/tests/nemo_text_processing/mr/test_date.py b/tests/nemo_text_processing/mr/test_date.py index 4ad5eb74d..ed9efb231 100644 --- a/tests/nemo_text_processing/mr/test_date.py +++ b/tests/nemo_text_processing/mr/test_date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,17 +15,17 @@ import pytest from parameterized import parameterized -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file -class TestDate: - inverse_normalizer_mr = InverseNormalizer(lang='mr', cache_dir=CACHE_DIR, overwrite_cache=False) +class TestMath: + normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') - @parameterized.expand(parse_test_case_file('mr/data_inverse_text_normalization/test_cases_date.txt')) + @parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_math.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer_mr.inverse_normalize(test_input, verbose=False) - assert pred == expected + def test_norm_math(self, test_input, expected): + preds = self.normalizer_zh.normalize(test_input) + assert expected == preds From 02fae02b2b22ee3a58e7da9bd784995b3906ea79 Mon Sep 17 00:00:00 2001 From: David Sargsyan <66821320+davidks13@users.noreply.github.com> Date: Thu, 15 Feb 2024 22:44:37 +0400 Subject: [PATCH 66/90] Armenian itn (#136) * Added Armenian ITN Signed-off-by: David Sargsyan * Added Armenian ITN Signed-off-by: David Sargsyan * Added Armenian ITN Signed-off-by: David Sargsyan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: David Sargsyan * Added context for tests and fixed CodeQL errors Signed-off-by: David Sargsyan * Revert "Added context for tests and fixed CodeQL errors" This reverts commit 2c804d941963c0be21d3aad07e6cd13568ab747b. 
Signed-off-by: David Sargsyan * Added context to some test files and fixed CodeQL errors Signed-off-by: David Sargsyan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: David Sargsyan * deleted unnecessary data Signed-off-by: David Sargsyan * translated a few measurements to Armenian Signed-off-by: David Sargsyan * adjusted some things for better readability and maintainer support Signed-off-by: David Sargsyan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixed one test case and some issues Signed-off-by: David Sargsyan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: David Sargsyan Co-authored-by: David Sargsyan Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui --- .../inverse_text_normalization/inverse_normalize.py | 4 ++++ tests/nemo_text_processing/hy/test_decimal.py | 2 +- tests/nemo_text_processing/hy/test_fraction.py | 2 +- tests/nemo_text_processing/hy/test_measure.py | 2 +- tests/nemo_text_processing/hy/test_money.py | 2 +- tests/nemo_text_processing/hy/test_ordinal.py | 2 +- tests/nemo_text_processing/hy/test_time.py | 2 +- tests/nemo_text_processing/hy/test_whitelist.py | 2 +- tests/nemo_text_processing/hy/test_word.py | 2 +- tools/text_processing_deployment/pynini_export.py | 4 ++++ 10 files changed, 16 insertions(+), 8 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index c996f9f3c..3efa09386 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -170,7 +170,11 @@ def parse_args(): parser.add_argument( "--language", help="language", +<<<<<<< HEAD choices=['en', 'de', 
'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hy', 'mr', 'ja'], +======= + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hy'], +>>>>>>> d9f749e (Armenian itn (#136)) default="en", type=str, ) diff --git a/tests/nemo_text_processing/hy/test_decimal.py b/tests/nemo_text_processing/hy/test_decimal.py index aaa65a0b7..8de89408a 100644 --- a/tests/nemo_text_processing/hy/test_decimal.py +++ b/tests/nemo_text_processing/hy/test_decimal.py @@ -22,7 +22,7 @@ class TestDecimal: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_decimal.txt')) diff --git a/tests/nemo_text_processing/hy/test_fraction.py b/tests/nemo_text_processing/hy/test_fraction.py index c9fcc7873..1e866de30 100644 --- a/tests/nemo_text_processing/hy/test_fraction.py +++ b/tests/nemo_text_processing/hy/test_fraction.py @@ -22,7 +22,7 @@ class TestFraction: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=True) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_fraction.txt')) diff --git a/tests/nemo_text_processing/hy/test_measure.py b/tests/nemo_text_processing/hy/test_measure.py index 9402523c7..d66387ac5 100644 --- a/tests/nemo_text_processing/hy/test_measure.py +++ b/tests/nemo_text_processing/hy/test_measure.py @@ -22,7 +22,7 @@ class TestMeasure: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_measure.txt')) diff --git a/tests/nemo_text_processing/hy/test_money.py b/tests/nemo_text_processing/hy/test_money.py index 291ce764f..5e0a0f72b 100644 --- a/tests/nemo_text_processing/hy/test_money.py +++ b/tests/nemo_text_processing/hy/test_money.py @@ -22,7 +22,7 @@ class TestMoney: - + inverse_normalizer = 
InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_money.txt')) diff --git a/tests/nemo_text_processing/hy/test_ordinal.py b/tests/nemo_text_processing/hy/test_ordinal.py index 1e93f5f2e..a724969e2 100644 --- a/tests/nemo_text_processing/hy/test_ordinal.py +++ b/tests/nemo_text_processing/hy/test_ordinal.py @@ -22,7 +22,7 @@ class TestOrdinal: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=True) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_ordinal.txt')) diff --git a/tests/nemo_text_processing/hy/test_time.py b/tests/nemo_text_processing/hy/test_time.py index 6c0f72537..ba5f21b29 100644 --- a/tests/nemo_text_processing/hy/test_time.py +++ b/tests/nemo_text_processing/hy/test_time.py @@ -22,7 +22,7 @@ class TestTime: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_time.txt')) diff --git a/tests/nemo_text_processing/hy/test_whitelist.py b/tests/nemo_text_processing/hy/test_whitelist.py index 75562cf9f..b16708851 100644 --- a/tests/nemo_text_processing/hy/test_whitelist.py +++ b/tests/nemo_text_processing/hy/test_whitelist.py @@ -22,7 +22,7 @@ class TestWhitelist: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_whitelist.txt')) diff --git a/tests/nemo_text_processing/hy/test_word.py b/tests/nemo_text_processing/hy/test_word.py index 30f7274b1..ea69ea32a 100644 --- a/tests/nemo_text_processing/hy/test_word.py +++ b/tests/nemo_text_processing/hy/test_word.py @@ -22,7 +22,7 @@ class TestWord: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) 
@parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_word.txt')) diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index c7607ca17..6fbd61392 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -86,7 +86,11 @@ def parse_args(): parser.add_argument( "--language", help="language", +<<<<<<< HEAD choices=["en", "de", "es", "pt", "ru", 'fr', 'hu', 'sv', 'vi', 'zh', 'ar', 'it', 'es_en', 'hy', 'mr', 'ja'], +======= + choices=["en", "de", "es", "pt", "ru", 'fr', 'hu', 'sv', 'vi', 'zh', 'ar', 'it', 'es_en', 'hy'], +>>>>>>> d9f749e (Armenian itn (#136)) type=str, default='en', ) From e9f32a85857f6ea09596626b93d8b35e5177ea03 Mon Sep 17 00:00:00 2001 From: Evelina <10428420+ekmb@users.noreply.github.com> Date: Thu, 29 Feb 2024 08:43:12 -0800 Subject: [PATCH 67/90] Fix CI (#142) * fix whitelist deployment Signed-off-by: Evelina * clean up Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * comment out tests to recreate grammars Signed-off-by: Evelina * shorten test Signed-off-by: Evelina * fix jenkins Signed-off-by: Evelina * cased for TN Signed-off-by: Evelina * revert debug changes Signed-off-by: Evelina * fix args default Signed-off-by: Evelina * try parallel Signed-off-by: Evelina * debug parallel Signed-off-by: Evelina * rerun Signed-off-by: Evelina * rerun Signed-off-by: Evelina * fix sh tests for local SH launcher Signed-off-by: Evelina * enable all ci tests Signed-off-by: Evelina * enable all ci tests Signed-off-by: Evelina --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- .../en/data_text_normalization/test_cases_money.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile 
b/Jenkinsfile index 367b6a449..e9ef51e94 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -190,7 +190,7 @@ pipeline { } } - stage('L0: Create RU TN/ITN Grammars & SV & PT') { + stage('L0: Create RU TN/ITN Grammars & SV & PT & ZH') { when { anyOf { branch 'main' diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt index e02209585..e2c828f42 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_money.txt @@ -63,4 +63,4 @@ $1,925.21~one thousand nine hundred and twenty five dollars twenty one cents $1,234.123~one thousand two hundred and thirty four point one two three dollars US $76.3 trillion~US seventy six point three trillion dollars US$76.3 trillion~seventy six point three trillion us dollars -The price for each canned salmon is $5 , each bottle of peanut butter is $3~The price for each canned salmon is five dollars , each bottle of peanut butter is three dollars \ No newline at end of file +The price for each canned salmon is $5 , each bottle of peanut butter is $3~The price for each canned salmon is five dollars , each bottle of peanut butter is three dollars From f0fd38a6577479671e0b0ae95039a9567e638b21 Mon Sep 17 00:00:00 2001 From: David Sargsyan <66821320+davidks13@users.noreply.github.com> Date: Wed, 13 Mar 2024 04:23:47 +0400 Subject: [PATCH 68/90] Armenian TN (#137) * merged with main branch and fixed conflicts Signed-off-by: David Sargsyan * fixing conflicts Signed-off-by: David Sargsyan * fixing some more conflicts Signed-off-by: David Sargsyan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: David Sargsyan * fixed a minor issue Signed-off-by: David Sargsyan * deleted unused imports Signed-off-by: David Sargsyan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci * Fix: add "hy" language option for armenian Signed-off-by: Ara Yeroyan <60027241+Ara-Yeroyan@users.noreply.github.com> * added optional space for measurements after cardinals/decimals Signed-off-by: David Sargsyan * added Armenian dot Signed-off-by: David Sargsyan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: David Sargsyan Signed-off-by: Ara Yeroyan <60027241+Ara-Yeroyan@users.noreply.github.com> Signed-off-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> Co-authored-by: David Sargsyan Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Ara Yeroyan <60027241+Ara-Yeroyan@users.noreply.github.com> Co-authored-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> Signed-off-by: Alex Cui --- tools/text_processing_deployment/pynini_export.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 6fbd61392..4e8626384 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -270,6 +270,10 @@ def parse_args(): from nemo_text_processing.inverse_text_normalization.ja.verbalizers.verbalize import ( VerbalizeFst as ITNVerbalizeFst, ) + from nemo_text_processing.text_normalization.hy.taggers.tokenize_and_classify import ( + ClassifyFst as TNClassifyFst, + ) + from nemo_text_processing.text_normalization.hy.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( output_dir=output_dir, From b0d58dde3cca030a1b32b91f953859eefd4aa98b Mon Sep 17 00:00:00 2001 From: Chinmay Patil <72211393+ChinmayPatil11@users.noreply.github.com> Date: Wed, 13 Mar 2024 22:49:29 +0530 Subject: [PATCH 69/90] Marathi ITN 
(#134) * Added Marathi ITN Signed-off-by: Chinmay Patil * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adding jenkins test Signed-off-by: Travis Bartley --------- Signed-off-by: Chinmay Patil Signed-off-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> Signed-off-by: Travis Bartley Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> Co-authored-by: Travis Bartley Signed-off-by: Alex Cui --- Jenkinsfile | 27 +++++++++++++++++++ .../inverse_normalize.py | 4 --- .../nemo_text_processing/mr/test_cardinal.py | 12 ++++++++- tests/nemo_text_processing/mr/test_date.py | 12 ++++++++- .../pynini_export.py | 4 --- 5 files changed, 49 insertions(+), 10 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index e9ef51e94..73408ef64 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -296,6 +296,33 @@ pipeline { } } + stage('L0: Create HY TN/ITN Grammars & MR') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: MR ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=mr --text="शून्य" --cache_dir ${MR_ITN_CACHE}' + } + } + stage('L0: HY TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hy --text="6" --cache_dir ${HY_TN_CACHE}' + } + } + stage('L0: HY ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hy --text="վեց" --cache_dir ${HY_TN_CACHE}' + } + } + } + } + // L1 Tests starts here diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py 
b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index 3efa09386..c996f9f3c 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -170,11 +170,7 @@ def parse_args(): parser.add_argument( "--language", help="language", -<<<<<<< HEAD choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hy', 'mr', 'ja'], -======= - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hy'], ->>>>>>> d9f749e (Armenian itn (#136)) default="en", type=str, ) diff --git a/tests/nemo_text_processing/mr/test_cardinal.py b/tests/nemo_text_processing/mr/test_cardinal.py index 73ca0423e..7fe575599 100644 --- a/tests/nemo_text_processing/mr/test_cardinal.py +++ b/tests/nemo_text_processing/mr/test_cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ import pytest from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -29,3 +30,12 @@ class TestPreprocess: def test_norm_preprocess(self, test_input, expected): preds = self.normalizer_zh.normalize(test_input) assert expected == preds + + inverse_normalizer_mr = InverseNormalizer(lang='mr', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('mr/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_mr.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/mr/test_date.py b/tests/nemo_text_processing/mr/test_date.py index ed9efb231..8fa51305a 100644 --- a/tests/nemo_text_processing/mr/test_date.py +++ b/tests/nemo_text_processing/mr/test_date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,6 +16,7 @@ from parameterized import parameterized from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -29,3 +30,12 @@ class TestMath: def test_norm_math(self, test_input, expected): preds = self.normalizer_zh.normalize(test_input) assert expected == preds +class TestDate: + inverse_normalizer_mr = InverseNormalizer(lang='mr', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('mr/data_inverse_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_mr.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 4e8626384..d6ceb84f2 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -86,11 +86,7 @@ def parse_args(): parser.add_argument( "--language", help="language", -<<<<<<< HEAD choices=["en", "de", "es", "pt", "ru", 'fr', 'hu', 'sv', 'vi', 'zh', 'ar', 'it', 'es_en', 'hy', 'mr', 'ja'], -======= - choices=["en", "de", "es", "pt", "ru", 'fr', 'hu', 'sv', 'vi', 'zh', 'ar', 'it', 'es_en', 'hy'], ->>>>>>> d9f749e (Armenian itn (#136)) type=str, default='en', ) From a514d806ec6ea453ad3d3de346db793561300267 Mon Sep 17 00:00:00 2001 From: tbartley94 <90423858+tbartley94@users.noreply.github.com> Date: Wed, 13 Mar 2024 13:42:41 -0700 Subject: [PATCH 70/90] jenkins fix (#150) * jenkins fix Signed-off-by: Travis Bartley * removing armenian to troubleshoot jenkins Signed-off-by: Travis Bartley * removing armenian to troubleshoot jenkins Signed-off-by: Travis Bartley * missing _init_ for python Signed-off-by: Travis Bartley * mislabled cache 
Signed-off-by: Travis Bartley --------- Signed-off-by: Travis Bartley Signed-off-by: Alex Cui --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 73408ef64..fdc65626f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -307,7 +307,7 @@ pipeline { parallel { stage('L0: MR ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=mr --text="शून्य" --cache_dir ${MR_ITN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=mr --text="शून्य " --cache_dir ${MR_TN_CACHE}' } } stage('L0: HY TN grammars') { @@ -317,7 +317,7 @@ pipeline { } stage('L0: HY ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hy --text="վեց" --cache_dir ${HY_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hy --text="վեց " --cache_dir ${HY_TN_CACHE}' } } } From 66cda82691a2363f525c92e9b0cf2e10dc3e9e4d Mon Sep 17 00:00:00 2001 From: "Buyuan(Alex) Cui" <69030297+BuyuanCui@users.noreply.github.com> Date: Tue, 30 Apr 2024 13:10:05 -0700 Subject: [PATCH 71/90] ZH sentence-level TN (#112) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Swedish telephone fix (#60) * port fix for telephone from swedish-itn branch Signed-off-by: Jim O'Regan * extend cardinal in non-deterministic mode Signed-off-by: Jim O'Regan * whitespace fixes Signed-off-by: Jim O'Regan * also fix in the verbaliser Signed-off-by: Jim O'Regan * Update Jenkinsfile Signed-off-by: Jim O’Regan --------- Signed-off-by: Jim O'Regan Signed-off-by: Jim O’Regan Signed-off-by: Alex Cui * log instead of print in graph_utils.py (#68) Signed-off-by: Enno Hermann Signed-off-by: Alex Cui * CER estimation speedup for audio-based text normalization 
(#73) * Replaced jiwer with editdistance to speed up CER estimation Signed-off-by: Vitaly Lavrukhin * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Vitaly Lavrukhin Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * add measure coverage for TN and ITN (#62) * add measure coverage for TN and ITN Signed-off-by: ealbasiri * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove unused imports Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove unused imports Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> * Remove unused imports Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update measure.py Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> --------- Signed-off-by: ealbasiri Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Signed-off-by: Alex Cui * upload es-ES, es-LA, fr-FR and it-IT g2p dicts (#63) * upload es-ES and fr-FR g2p dicts Signed-off-by: Mariana Graterol Fuenmayor * add inits Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add NALA Spanish dict Signed-off-by: Mariana Graterol Fuenmayor * rename Spanish and French dictionaries Signed-off-by: Mariana Graterol Fuenmayor * add Italian dictionary Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor 
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * add country codes from hu (#77) Signed-off-by: Jim O'Regan Signed-off-by: Alex Cui * fix electronic case for username (#75) * fix electronic username w/o . Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * disable sv tests Signed-off-by: Evelina * disable sv tests Signed-off-by: Evelina * fix ar test Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * disable sv tests Signed-off-by: Evelina * update ci dirs, enable sv tests Signed-off-by: Evelina --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * 0.1.8 release (#79) Signed-off-by: Evelina Signed-off-by: Alex Cui * Codeswitched ES/EN ITN (#78) * Initial commit for ES-EN codeswitched ITN Signed-off-by: Anand Joseph * Enable export for es_en codeswitched ITN Signed-off-by: Anand Joseph * Add whitelist, update weights Signed-off-by: Anand Joseph * Add tests for en_es, zone tagged separately in es Signed-off-by: Anand Joseph * Fix path to test data for sparrowhawk tests Signed-off-by: Anand Joseph * Update Jenkinsfile - enable ES/EN tests Signed-off-by: Anand Joseph * Add __init__.py files Signed-off-by: Anand Joseph * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix issues with failed docker build - due to archiving of debian and issues with re2 Signed-off-by: Anand Joseph * Remove unused imports and variables Signed-off-by: Anand Joseph * Update date Signed-off-by: Anand Joseph * Enable NBSP in sparrowhawk tests Signed-off-by: Anand Joseph * Update copyrights Signed-off-by: Anand Joseph * Update cache path in for ES/EN CI/CD Signed-off-by: Anand Joseph --------- Signed-off-by: Anand Joseph 
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * electronic verbalizer fallback (#81) * 0.1.8 release Signed-off-by: Evelina * add elec fallback Signed-off-by: Evelina * update ci Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * minor normalize.py edit for usability (#84) * electronic verbalizer fallback (#81) * 0.1.8 release Signed-off-by: Evelina * add elec fallback Signed-off-by: Evelina * update ci Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Linnea Pari Leaver * documentation edits for grammar/clarity Signed-off-by: Linnea Pari Leaver * added --output_field flag for command line interface Signed-off-by: Linnea Pari Leaver * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Evelina Signed-off-by: Linnea Pari Leaver Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Linnea Pari Leaver Signed-off-by: Alex Cui * Swedish ITN (#40) * force two digits for month Signed-off-by: Jim O'Regan * put it in a function, because I reject the garbage pre-commit.ci came up with Signed-off-by: Jim O'Regan * wrap some more pieces Signed-off-by: Jim O'Regan * add graph pieces Signed-off-by: Jim O'Regan * delete junk Signed-off-by: Jim O'Regan * my copyright Signed-off-by: Jim O'Regan * add date verbaliser (copy from es) Signed-off-by: Jim O'Regan * tweaks Signed-off-by: Jim O'Regan * add 
date verbaliser Signed-off-by: Jim O'Regan * add right tokens Signed-off-by: Jim O'Regan * some tweaks, more needed Signed-off-by: Jim O'Regan * basic test cases Signed-off-by: Jim O'Regan * tweaks to TN date tagger Signed-off-by: Jim O'Regan * tweaks to ITN date tagger Signed-off-by: Jim O'Regan * tweaks to TN date tagger Signed-off-by: Jim O'Regan * remove duplicate Signed-off-by: Jim O'Regan * moved to tagger Signed-off-by: Jim O'Regan * nothing actually fixed here Signed-off-by: Jim O'Regan * now most tests pass Signed-off-by: Jim O'Regan * electronic Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fractions Signed-off-by: Jim O'Regan * extend Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * bare fractions is a bit of an overreach Signed-off-by: Jim O'Regan * whitelist Signed-off-by: Jim O'Regan * just inverting the TN whitelist tagger will not work/be useful Signed-off-by: Jim O'Regan * copy from English Signed-off-by: Jim O'Regan * overwrite with version from en Signed-off-by: Jim O'Regan * add basic test case Signed-off-by: Jim O'Regan * fix call Signed-off-by: Jim O'Regan * swap tsv sides Signed-off-by: Jim O'Regan * remove unused imports Signed-off-by: Jim O'Regan * add optional_era variable Signed-off-by: Jim O'Regan * add test case Signed-off-by: Jim O'Regan * make deterministic default, like most of the others Signed-off-by: Jim O'Regan * also add lowercase versions Signed-off-by: Jim O'Regan * replacing NEMO_SPACE does not work either Signed-off-by: Jim O'Regan * increasing weight... did not work last time Signed-off-by: Jim O'Regan * tweaking test cases, in case it was a sentence splitting issue. 
It was not Signed-off-by: Jim O'Regan * put the full stops back Signed-off-by: Jim O'Regan * add filler words Signed-off-by: Jim O'Regan * try splitting this out to see if it makes a difference Signed-off-by: Jim O'Regan * aha, this part should be non-deterministic only Signed-off-by: Jim O'Regan * single line only Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Revert "increasing weight... did not work last time" This reverts commit 39b020b50db745dfd6b281c8cbca45a033926996. Signed-off-by: Jim O'Regan * disabling ITN here makes TN work again(?) Signed-off-by: Jim O'Regan * Revert "disabling ITN here makes TN work again(?)" This reverts commit be49d7d5c687876e51c2e9ce1cf1e01491df280f. Signed-off-by: Jim O'Regan * changing the variable name fixes norm tests Signed-off-by: Jim O'Regan * change the variable names Signed-off-by: Jim O'Regan * add missing test tooling Signed-off-by: Jim O'Regan * copy telephone fixes from hu Signed-off-by: Jim O'Regan * copy telephone fixes from hu Signed-off-by: Jim O'Regan * add a piece for area codes for ITN Signed-off-by: Jim O'Regan * add country codes from hu Signed-off-by: Jim O'Regan * extend any_read_digit for ITN Signed-off-by: Jim O'Regan * country/area codes for ITN Signed-off-by: Jim O'Regan * first attempt Signed-off-by: Jim O'Regan * add to t&c Signed-off-by: Jim O'Regan * add to t&c Signed-off-by: Jim O'Regan * remove country codes for the time being, makes things ambiguous Signed-off-by: Jim O'Regan * basic test cases Signed-off-by: Jim O'Regan * fix Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove trailing whitespace Signed-off-by: Jim O'Regan * Update __init__.py Signed-off-by: Jim O’Regan * fix comment Signed-off-by: Jim O'Regan * fix comment Signed-off-by: Jim O'Regan * basic transform of TN tests Signed-off-by: Jim O'Regan * basic transformation 
of TN decimal tests Signed-off-by: Jim O'Regan * slight changes to date Signed-off-by: Jim O'Regan * tweak Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * include space Signed-off-by: Jim O'Regan * problem with tusen Signed-off-by: Jim O'Regan * problem with tusen was not that Signed-off-by: Jim O'Regan * add functions from hu Signed-off-by: Jim O'Regan * respect my own copyright xD Signed-off-by: Jim O'Regan * move data loading to constructor; had weirdness in this file, probably due to module-level python-suckage Signed-off-by: Jim O'Regan * move data loading, this has been an oddity before Signed-off-by: Jim O'Regan * try changing this year declaration Signed-off-by: Jim O'Regan * add year + era Signed-off-by: Jim O'Regan * eliminate more module-level data loading Signed-off-by: Jim O'Regan * Revert "eliminate more module-level data loading" This reverts commit 6a26e5d927817e1308e818758196924441ff7b3a. Signed-off-by: Jim O'Regan * expose variables Signed-off-by: Jim O'Regan * extra param for itn mode Signed-off-by: Jim O'Regan * change call Signed-off-by: Jim O'Regan * change comment Signed-off-by: Jim O'Regan * change comment Signed-off-by: Jim O'Regan * move data loading Signed-off-by: Jim O'Regan * fix parens Signed-off-by: Jim O'Regan * move data loading Signed-off-by: Jim O'Regan * adapt comments Signed-off-by: Jim O'Regan * adapt comments Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adapt/extend tests Signed-off-by: Jim O'Regan * fix dict init/change keys to something useful Signed-off-by: Jim O'Regan * initial stab at prefixed numbers Signed-off-by: Jim O'Regan * some adapting Signed-off-by: Jim O'Regan * insert kl. 
if absent Signed-off-by: Jim O'Regan * fix comments Signed-off-by: Jim O'Regan * the relative prefixed times Signed-off-by: Jim O'Regan * + comments Signed-off-by: Jim O'Regan * enable time Signed-off-by: Jim O'Regan * space in both directions Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix comment Signed-off-by: Jim O'Regan * fix hours to Signed-off-by: Jim O'Regan * split by before/after Signed-off-by: Jim O'Regan * delete, not insert Signed-off-by: Jim O'Regan * fix if Signed-off-by: Jim O'Regan * kl. 9 Signed-off-by: Jim O'Regan * copy from en Signed-off-by: Jim O'Regan * keep only get_abs_path Signed-off-by: Jim O'Regan * imports Signed-off-by: Jim O'Regan * add trimmed file Signed-off-by: Jim O'Regan * fix imports Signed-off-by: Jim O'Regan * two abs_paths... could be fun Signed-off-by: Jim O'Regan * minutes/seconds Signed-off-by: Jim O'Regan * suffix Signed-off-by: Jim O'Regan * delete, not insert Signed-off-by: Jim O'Regan * one optional Signed-off-by: Jim O'Regan * export variable Signed-off-by: Jim O'Regan * kl. or one of suffix/zone Signed-off-by: Jim O'Regan * already disambiguated Signed-off-by: Jim O'Regan * closure Signed-off-by: Jim O'Regan * do not insert kl. 
Signed-off-by: Jim O'Regan * fix test case Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix spelling Signed-off-by: Jim O'Regan * Delete measure.py Signed-off-by: Jim O’Regan * Delete money.py Signed-off-by: Jim O’Regan * remove unused pieces Signed-off-by: Jim O'Regan * remove unused pieces Signed-off-by: Jim O'Regan * remove unused test pieces Signed-off-by: Jim O'Regan * copy from es Signed-off-by: Jim O'Regan * add SV ITN Signed-off-by: Jim O'Regan * add/update __init__ Signed-off-by: Jim O'Regan * blank line Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix comment Signed-off-by: Jim O'Regan * fix lang Signed-off-by: Jim O'Regan * fix decimal verbaliser Signed-off-by: Jim O'Regan * fix Signed-off-by: Jim O'Regan * remove year, conflicts with cardinal Signed-off-by: Jim O'Regan * space before, not after Signed-off-by: Jim O'Regan * fix cardinal tests Signed-off-by: Jim O'Regan * spurious deletion Signed-off-by: Jim O'Regan * fix comment Signed-off-by: Jim O'Regan * unused imports Signed-off-by: Jim O'Regan * re-enable SV TN; enable SV ITN Signed-off-by: Jim O'Regan * Revert "re-enable SV TN; enable SV ITN" This reverts commit 3ce4dfde1f70a89afc274284f6e4c737b3fac95b. 
Signed-off-by: Jim O'Regan * fix singulras Signed-off-by: Jim O'Regan * add an export Signed-off-by: Jim O'Regan * change integer graph Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * move spaces Signed-off-by: Jim O'Regan * use cdrewrite Signed-off-by: Jim O'Regan * just EOS/BOS Signed-off-by: Jim O'Regan * fix typo Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Jim O'Regan * omit en/ett, because they are also articles Signed-off-by: Jim O'Regan * uncomment Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * unused Signed-off-by: Jim O'Regan * strip spaces from decimal part Signed-off-by: Jim O'Regan * export Signed-off-by: Jim O'Regan * partial fix, not what I wanted Signed-off-by: Jim O'Regan * move comment Signed-off-by: Jim O'Regan * en/ett cannot work in itn case Signed-off-by: Jim O'Regan * be more deliberate in graph construction Signed-off-by: Jim O'Regan * accept both Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * +2 tests Signed-off-by: Jim O'Regan * (try to) accept singular quantities for plurals Signed-off-by: Jim O'Regan * retry Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * oops Signed-off-by: Jim O'Regan * replace Signed-off-by: Jim O'Regan * arcmap Signed-off-by: Jim O'Regan * version without ones Signed-off-by: Jim O'Regan * add another test Signed-off-by: Jim O'Regan * change graph Signed-off-by: Jim O'Regan * simplify Signed-off-by: Jim O'Regan * get rid of this, this is where it goes wrong Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * 
more tests Signed-off-by: Jim O'Regan * add a test Signed-off-by: Jim O'Regan * multiple states from both ones, try removing and readding Signed-off-by: Jim O'Regan * remove ones, see if that fixes at least the bare quantities Signed-off-by: Jim O'Regan * works in the repl, dunno why it still breaks Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove duplicate Signed-off-by: Jim O'Regan * move definition Signed-off-by: Jim O'Regan * simplify Signed-off-by: Jim O'Regan * tweak Signed-off-by: Jim O'Regan * another test Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * local declaration, seems to not be working Signed-off-by: Jim O'Regan * more tests Signed-off-by: Jim O'Regan * match verbaliser Signed-off-by: Jim O'Regan * fix last two failing tests Signed-off-by: Jim O'Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add missing tests for telephone and word Signed-off-by: Jim O'Regan * remove unused variable Signed-off-by: Jim O'Regan * remove unused imports Signed-off-by: Jim O'Regan * fix comment Signed-off-by: Jim O'Regan * get rid of convert_space, tests fail Signed-off-by: Jim O'Regan * put convert_spaces back, change test file; pytest fails Signed-off-by: Jim O'Regan * Revert "put convert_spaces back, change test file; pytest fails" This reverts commit a7bb7489137b8026aab02aff64df39e874630043. 
Signed-off-by: Jim O'Regan * put convert_spaces back, change test file; pytest fails, take 2 Signed-off-by: Jim O'Regan * deliberately remove spaces rather than have a non-determinism that comes out differently in sparrowhawk Signed-off-by: Jim O'Regan * try converting the non-breaking spaces in the shell script Signed-off-by: Jim O'Regan * wrong place Signed-off-by: Jim O'Regan * fix typo Signed-off-by: Jim O'Regan * fix path Signed-off-by: Jim O'Regan * export Signed-off-by: Jim O'Regan * export Signed-off-by: Jim O'Regan * remove unused Signed-off-by: Jim O'Regan * Update date.py Signed-off-by: Jim O’Regan * Update time.py Signed-off-by: Jim O’Regan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix comment Signed-off-by: Jim O’Regan * trim comments Signed-off-by: Jim O’Regan * remove commented line Signed-off-by: Jim O’Regan * en halv Signed-off-by: Jim O’Regan * Update test_sparrowhawk_inverse_text_normalization.sh Signed-off-by: Jim O’Regan --------- Signed-off-by: Jim O'Regan Signed-off-by: Jim O’Regan Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * Italian_TN (#67) * add TN italian Signed-off-by: GiacomoLeoneMaria * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix init Signed-off-by: GiacomoLeoneMaria * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix LOCATION Signed-off-by: GiacomoLeoneMaria * modify graph_utils Signed-off-by: GiacomoLeoneMaria * correct decimals Signed-off-by: GiacomoLeoneMaria * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix electronic Signed-off-by: Giacomo Cavallini * fix electronic Signed-off-by: Giacomo Cavallini * fix measure Signed-off-by: Giacomo Cavallini --------- Signed-off-by: GiacomoLeoneMaria Signed-off-by: Giacomo Cavallini 
Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Signed-off-by: Alex Cui * Zh itn (#74) * Add ZH ITN Signed-off-by: Anand Joseph * Fix copyrights and code cleanup Signed-off-by: Anand Joseph * Remove invalid tests Signed-off-by: Anand Joseph * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Resolve CodeQL issues Signed-off-by: Anand Joseph * Cleanup Signed-off-by: Anand Joseph * Fix missing 'zh' option for ITN and correct comment Signed-off-by: Anand Joseph * Update __init__.py Change to zh instead of en for the imports. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update for decimal test data Signed-off-by: BuyuanCui * update for langauge import Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update for Chinese punctuations Signed-off-by: BuyuanCui * a new class for whitelist Signed-off-by: BuyuanCui * PYNINI_AVAILABLE = False Signed-off-by: BuyuanCui * recreated due to file import format issue Signed-off-by: BuyuanCui * recreated due to format issue Signed-off-by: BuyuanCui * caught duplicates, removed Signed-off-by: BuyuanCui * removed duplicates, arranges for CHInese Yuan updates Signed-off-by: BuyuanCui * updates accordingly to the comments from last PR. Recreated some of the files due to format issues Signed-off-by: BuyuanCui * removed the hours_to and minute_to files used for back counting. ALso removed am and pm suffix files according to the last PR. 
Recreated some of them for format issue Signed-off-by: BuyuanCui * re-added this file to avoid data file import error Signed-off-by: BuyuanCui * updated gramamr according to last PR. Removed the acceptance of 千 Signed-off-by: BuyuanCui * updates Signed-off-by: BuyuanCui * updated according to last PR. Removed comma after decimal points Signed-off-by: BuyuanCui * gramamr for Fraction Signed-off-by: BuyuanCui * gramamr for money and updated according to last PR. Plus process of 元 Signed-off-by: BuyuanCui * ordinal grammar. updates due to the updates in cardinal grammar Signed-off-by: BuyuanCui * updated accordingly to last PR comments. removing am and pm and allowing simple mandarin expression Signed-off-by: BuyuanCui * arrangements Signed-off-by: BuyuanCui * added whitelist grammar Signed-off-by: BuyuanCui * word grammar for non-classified items Signed-off-by: BuyuanCui * updated cardinal, decimal, time, itn data Signed-off-by: BuyuanCui * updates according to last PR Signed-off-by: BuyuanCui * updates according to the updates for cardinal grammar Signed-off-by: BuyuanCui * updates for more Mandarin punctuations Signed-off-by: BuyuanCui * updated accordingly to last PR. 
removing am pm Signed-off-by: BuyuanCui * adjustment on the weight Signed-off-by: BuyuanCui * updated accordingly to the targger updates Signed-off-by: BuyuanCui * updated accordingly to the time tagger Signed-off-by: BuyuanCui * updates according to changes in tagger on am and pm Signed-off-by: BuyuanCui * verbalizer for fraction Signed-off-by: BuyuanCui * added for mandarin grammar Signed-off-by: BuyuanCui * kept this file because using English utils results in data namin error Signed-off-by: BuyuanCui * merge conflict Signed-off-by: BuyuanCui * removed unsed imports Signed-off-by: BuyuanCui * deleted unsed import os Signed-off-by: BuyuanCui * deleted unsed variables Signed-off-by: BuyuanCui * removed unsed imports Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates and edits based on pr checks Signed-off-by: BuyuanCui * updates and edits based on pr checks Signed-off-by: BuyuanCui * format issue, reccreated Signed-off-by: BuyuanCui * format issue recreated Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixed codeing style/format Signed-off-by: BuyuanCui * fixed coding style and format Signed-off-by: BuyuanCui * removed duplicated graph for 毛 Signed-off-by: BuyuanCui * removed the comment Signed-off-by: BuyuanCui * removed the comment Signed-off-by: BuyuanCui * removing unnecessary comments Signed-off-by: BuyuanCui * unnecessary comment removed Signed-off-by: BuyuanCui * test file updated for more cases Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updated with a comment explaining why this file is kept Signed-off-by: BuyuanCui * updated the file explaining why this file is kept Signed-off-by: BuyuanCui * added Mandarin as zh Signed-off-by: BuyuanCui * removing for dplication Signed-off-by: BuyuanCui * 
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removed unused NEMO objects Signed-off-by: BuyuanCui * removed duplicates Signed-off-by: BuyuanCui * removing unsed imports Signed-off-by: BuyuanCui * updates to fix test file failures Signed-off-by: BuyuanCui * updates to fix file failtures Signed-off-by: BuyuanCui * updates to resolve test case failture Signed-off-by: BuyuanCui * updates to resolve test case failure Signed-off-by: BuyuanCui * updates to resolve test case failure Signed-off-by: BuyuanCui * updates to resolve test case failure Signed-off-by: BuyuanCui * updates to adap to cardinal grammar changes Signed-off-by: BuyuanCui * updates to adapt to grammar changes Signed-off-by: BuyuanCui * updates to adopt to cardinal grammar changes Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix style Signed-off-by: BuyuanCui * fix style Signed-off-by: BuyuanCui * fix style Signed-off-by: BuyuanCui * fix style Signed-off-by: BuyuanCui * fixing pr checks Signed-off-by: BuyuanCui * removed // for zhtn/itn cache Signed-off-by: BuyuanCui * Update inverse_normalize.py Added zh as a selection to pass Jenkins checks. 
Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> --------- Signed-off-by: Anand Joseph Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> Signed-off-by: BuyuanCui Co-authored-by: Alex Cui Co-authored-by: Anand Joseph Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * updated pynini_export.py file to create far files (#88) Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * readd Swedish (#87) Signed-off-by: Jim O'Regan Signed-off-by: Alex Cui * Zh tn 0712 (#89) * updates Signed-off-by: BuyuanCui * updates and fixings according to document on natonal gideline Signed-off-by: BuyuanCui * Decimal grammar added Signed-off-by: BuyuanCui * fraction updated Signed-off-by: BuyuanCui * money updated Signed-off-by: BuyuanCui * ordinal grammar added Signed-off-by: BuyuanCui * punctuation grammar added Signed-off-by: BuyuanCui * time gramamr updated Signed-off-by: BuyuanCui * tokenizaer updated Signed-off-by: BuyuanCui * updates on certificate Signed-off-by: BuyuanCui * data updated and added due to updates and chanegs to the existing grammar Signed-off-by: BuyuanCui * cardinal updated Signed-off-by: BuyuanCui * date grammar changed Signed-off-by: BuyuanCui * decimal grammar added Signed-off-by: BuyuanCui * grammar updated Signed-off-by: BuyuanCui * grammar updated Signed-off-by: BuyuanCui * grammar added Signed-off-by: BuyuanCui * grammar updates Signed-off-by: BuyuanCui * test data added Signed-off-by: BuyuanCui * test python file edits Signed-off-by: BuyuanCui * updates for tn1.0 and previous tn grammar from contribution Signed-off-by: BuyuanCui * test cases updated Signed-off-by: BuyuanCui * coding style fixed Signed-off-by: BuyuanCui * dates updated for init files Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updated the date for zh Signed-off-by: BuyuanCui * removed unsed 
imports Signed-off-by: BuyuanCui * removed comments Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added back the itn tests Signed-off-by: BuyuanCui * added back measure and math from previou TN Signed-off-by: BuyuanCui * updated for tests reruns Signed-off-by: BuyuanCui * updats Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updated weights Signed-off-by: BuyuanCui --------- Signed-off-by: BuyuanCui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * Zh tn char (#95) * file name change Signed-off-by: BuyuanCui * file name change Signed-off-by: BuyuanCui * file name change Signed-off-by: BuyuanCui * file name change Signed-off-by: BuyuanCui * file name change Signed-off-by: BuyuanCui * file name Signed-off-by: BuyuanCui * file name Signed-off-by: BuyuanCui * file name Signed-off-by: BuyuanCui * file name Signed-off-by: BuyuanCui * file name Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * code stle Signed-off-by: BuyuanCui * fixed import error Signed-off-by: BuyuanCui --------- Signed-off-by: BuyuanCui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * audio-based TN fix for empty pred_text/text (#92) * fix for empty pred_text Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add unittests Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix path Signed-off-by: Evelina * fix path Signed-off-by: Evelina * fix pytest Signed-off-by: Evelina --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * pip 1.2.0 Signed-off-by: Evelina Signed-off-by: Alex Cui * French tn (#91) * add tests for fr tn Signed-off-by: Mariana Graterol Fuenmayor * add fr tn for cardinals, decimals, fractions and ordinals Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * delete it far files from tools Signed-off-by: Mariana Graterol Fuenmayor * add languages to run_evaluate Signed-off-by: Mariana Graterol Fuenmayor * remove ambiguous spacing Signed-off-by: Mariana Graterol Fuenmayor * enable sh testing for fr tn Signed-off-by: Mariana Graterol Fuenmayor * fix bug with ordinals Signed-off-by: Mariana Graterol Fuenmayor * update jenkinsfile cache date Signed-off-by: Mariana Graterol Fuenmayor * fix test for ordinals Signed-off-by: Mariana Graterol Fuenmayor * update tn cache for fr Signed-off-by: Mariana Graterol Fuenmayor * resolve codeql issues Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * Add whitelist_tech.tsv (#96) Signed-off-by: Anand Joseph Signed-off-by: Alex Cui * Zhitn 0727 (#93) * updates on itn grammar to pass sparrowhawk tests Signed-off-by: BuyuanCui * updats for sparrowhawk tests Signed-off-by: BuyuanCui * updates fro sparrowhawk tests Signed-off-by: BuyuanCui * coding style fix Signed-off-by: BuyuanCui * updates for coding style and sparrowhawk test Signed-off-by: BuyuanCui * updated classes for tests on whitelist and word grammar Signed-off-by: BuyuanCui * added for tests on whitelist Signed-off-by: BuyuanCui * added for test on word Signed-off-by: BuyuanCui * added to run test on whitelist Signed-off-by: BuyuanCui * added to run test on word Signed-off-by: BuyuanCui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update test_word.py Removed unused import. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_word.py Removed imports according to CodeQL Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_whitelist.py Removing imports according to CodeQL Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_whitelist.py Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update Jenkinsfile changed zh cache to 07-27-23 as it is the latest update. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> --------- Signed-off-by: BuyuanCui Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * Es tn romans fix (#98) * fix es tn roman exceptions Signed-off-by: Mariana Graterol Fuenmayor * update jenkinsfile Signed-off-by: Mariana Graterol Fuenmayor * update eval script for ITN Signed-off-by: Mariana Graterol Fuenmayor * codeql fix Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Alex Cui * Change docker image (#102) Change docker image to one including sparrowhawk Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Signed-off-by: Alex Cui * Print warning instead exception (#97) * raise text Signed-off-by: Nikolay Karpov * text arg Signed-off-by: Nikolay Karpov * Failed text Signed-off-by: Nikolay Karpov * add logger Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rm raise Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci * logger Signed-off-by: Nikolay Karpov * NeMo-text-processing Signed-off-by: Nikolay Karpov * info level Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rm raise Signed-off-by: Nikolay Karpov * verbose Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Normalizer.select_verbalizer Signed-off-by: Nikolay Karpov * Exception Signed-off-by: Nikolay Karpov * verbose Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * restart ci Signed-off-by: Evelina --------- Signed-off-by: Nikolay Karpov Signed-off-by: Nikolay Karpov Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Nikolay Karpov Co-authored-by: Evelina Signed-off-by: Alex Cui * warning regardless of verbose flag (#107) * warning Signed-off-by: Nikolay Karpov * self.verbose Signed-off-by: Nikolay Karpov --------- Signed-off-by: Nikolay Karpov Signed-off-by: Alex Cui * Unpin setuptools (#106) Signed-off-by: Peter Plantinga Signed-off-by: Alex Cui * fixed warnings: File is not always closes. 
(#113) Signed-off-by: Xuesong Yang <16880-xueyang@users.noreply.gitlab-master.nvidia.com> Co-authored-by: Xuesong Yang <16880-xueyang@users.noreply.gitlab-master.nvidia.com> Signed-off-by: Alex Cui * fix bug #111 (ar currencies) (#117) * fix bug #111 (ar currencies) Signed-off-by: Mariana Graterol Fuenmayor * update ci folder Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Alex Cui * Logging clean up + IT TN fix (#118) * fix utils and it TN Signed-off-by: Evelina * clean up Signed-off-by: Evelina * fix logging Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix format Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix format Signed-off-by: Evelina * fix format Signed-off-by: Evelina * add IT TN to CI Signed-off-by: Evelina * update patch Signed-off-by: Evelina --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * Time_IT_TN (#105) * add time verbalizer Signed-off-by: GiacomoLeoneMaria * add time tagger and verba Signed-off-by: GiacomoLeoneMaria * add pytest time Signed-off-by: GiacomoLeoneMaria * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * codeQL Signed-off-by: GiacomoLeoneMaria * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix numbers with eight Signed-off-by: GiacomoLeoneMaria --------- Signed-off-by: GiacomoLeoneMaria Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui * IT TN improvement on tests (#120) * add missing test cases Signed-off-by: Mariana Graterol Fuenmayor * fix bug with time tests Signed-off-by: Mariana Graterol Fuenmayor * update ci date Signed-off-by: 
Mariana Graterol Fuenmayor * add sentence test cases Signed-off-by: Mariana Graterol Fuenmayor * refine shortest path for irregular cardinals Signed-off-by: Mariana Graterol Fuenmayor * update ci date Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Alex Cui * add single letter exception for roman numerals (#121) * add single letter exception for roman numerals Signed-off-by: Mariana Graterol Fuenmayor * update ci dir Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Alex Cui * rewrote tokenizer Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * removed the file and replaced it with char in 1.8 Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * jenkins file update Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * to fix tn bug@ xuesong Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * tn bug Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Alex Cui * fixeds and updates Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Alex Cui * adjustments Signed-off-by: BuyuanCui Signed-off-by: Alex Cui * testing commit Signed-off-by: Alex Cui * removing unsed file Signed-off-by: Alex Cui * updated test cases Signed-off-by: Alex Cui * updating etst cases Signed-off-by: Alex Cui * updates adapting to graphs Signed-off-by: Alex Cui * updated cases for SH tests Signed-off-by: Alex Cui * updated cases Signed-off-by: Alex Cui * added some sentences Signed-off-by: Alex Cui * test cases update Signed-off-by: Alex Cui * solving rebase issue, repushing changes Signed-off-by: Alex Cui * resolving conflict Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixings 
according to ci Signed-off-by: Alex Cui * fixings according to the ci Signed-off-by: Alex Cui * removed not used Signed-off-by: Alex Cui * notused removing Signed-off-by: Alex Cui * format issue Signed-off-by: Alex Cui * formt issue Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removing unused files Signed-off-by: Alex Cui * removing unused files Signed-off-by: Alex Cui * remiving unsed files; Signed-off-by: Alex Cui * removing unsed files Signed-off-by: Alex Cui * removing unsed files Signed-off-by: Alex Cui * added sentences as test cases Signed-off-by: Alex Cui * added senetnces as test cases Signed-off-by: Alex Cui * removed commentyed out tests Signed-off-by: Alex Cui * updating dates Signed-off-by: Alex Cui * attemps to fix bug Signed-off-by: Alex Cui * inprocess of fixing the bug Signed-off-by: Alex Cui * fixing existing issue Signed-off-by: Alex Cui * updated graph_utils, tokenize and classify, and word graphs Signed-off-by: Alex Cui * added bacl the ppostprocessor far creation Signed-off-by: Alex Cui * updated NEMO_NOT_ALPHA as a new variable Signed-off-by: Alex Cui * far files Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * combiedn into measure Signed-off-by: Alex Cui * removing and combined to meaasure Signed-off-by: Alex Cui * removing, not used Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates to fix space issue Signed-off-by: Alex Cui * updates to fix space issue Signed-off-by: Alex Cui * updates to fix space issue Signed-off-by: Alex Cui * updates to solve the space issue Signed-off-by: Alex Cui * resolving sh issue Signed-off-by: Alex Cui * resolving sh test issue Signed-off-by: Alex Cui * adding anands updates Signed-off-by: Alex Cui * data updated for measure and whitelist Signed-off-by: Alex Cui * 
updates Signed-off-by: Alex Cui * updates Signed-off-by: Alex Cui * updates Signed-off-by: Alex Cui * removing fraction and math part Signed-off-by: Alex Cui * removing comments Signed-off-by: Alex Cui * removing preprocessor, updating measure, adding shitelist cases Signed-off-by: Alex Cui * removing processor, modification for sp test, shitelist and word Signed-off-by: Alex Cui * updating zh date Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * realized itn being cvommented out, adding back Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * trying to run zh tn separately because it takes long time to run Signed-off-by: Alex Cui * modification to ru zh tn separately Signed-off-by: Alex Cui * independent zh tnitn tests for more time Signed-off-by: Alex Cui * adding lines to save far file Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates for reducing testing time Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * for ounct graph Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removing used graphs Signed-off-by: Alex Cui * format and removing used comments Signed-off-by: Alex Cui * removing this one, not used Signed-off-by: Alex Cui * remove unused commentss Signed-off-by: Alex Cui * removing unsed comments Signed-off-by: Alex Cui * removing unsed comments Signed-off-by: Alex Cui * removing comments Signed-off-by: Alex Cui * Delete tools/text_processing_deployment/zh directory Removing far files. 
Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * updates according to the github comments Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removing comments Signed-off-by: Alex Cui * punct grammar Signed-off-by: Alex Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update test_cases_cardinal.txt Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update Dockerfile Copied from main branch ( which included Anand's updates) Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update launch.sh Found differences in the file. Fixing it back. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_word.py Saw word ITN being commented out. Adding it back. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update money.py Found cardinal grammar not accepting suffix. Fixed it. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update Jenkinsfile Removed duplicated zh test from line 230s Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update utils.py Addressing bug raised in bug in graph_utils.py of zh ITN and decimal tagger of ar TN #162. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update graph_utils.py Addressing bug in graph_utils.py of zh ITN and decimal tagger of ar TN #162. 
Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update measure.py Fixing code style, removing unused imports Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update word.py Fixing code style, removing unused imports Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update measure.py Removing unused import. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update post_processing.py Removing unused imports Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update post_processing.py Removing unused import Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update word.py Removing unused imports Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update cardinal.py Deleting unused graph Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update word.py Removing import pynini Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update word.py removing pynini import Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update verbalize.py removing pynutil import Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update post_processing.py removing punct graph imported Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_sparrowhawk_normalization.sh Update on test issue for Docker file locations Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_ordinal.py Fixing style. 
Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Delete nemo_text_processing/text_normalization/zh/taggers/math_symbol.py Removing because it's not one of the semiotic classes. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Delete nemo_text_processing/text_normalization/zh/verbalizers/math_symbol.py Removing because it's not one of the semiotic classes. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update Jenkinsfile Updating Jenkins date Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> --------- Signed-off-by: Jim O'Regan Signed-off-by: Jim O’Regan Signed-off-by: Alex Cui Signed-off-by: Enno Hermann Signed-off-by: Vitaly Lavrukhin Signed-off-by: ealbasiri Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Evelina Signed-off-by: Anand Joseph Signed-off-by: Linnea Pari Leaver Signed-off-by: GiacomoLeoneMaria Signed-off-by: Giacomo Cavallini Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> Signed-off-by: BuyuanCui Signed-off-by: Nikolay Karpov Signed-off-by: Nikolay Karpov Signed-off-by: Peter Plantinga Signed-off-by: Xuesong Yang <16880-xueyang@users.noreply.gitlab-master.nvidia.com> Co-authored-by: Jim O’Regan Co-authored-by: Enno Hermann Co-authored-by: Vitaly Lavrukhin Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Enas Albasiri <71229149+ealbasiri@users.noreply.github.com> Co-authored-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> Co-authored-by: lleaver <137942999+lleaver@users.noreply.github.com> Co-authored-by: Linnea Pari Leaver Co-authored-by: Jim O’Regan 
Co-authored-by: Giacomo Leone Maria Cavallini <72698188+GiacomoLeoneMaria@users.noreply.github.com> Co-authored-by: Alex Cui Co-authored-by: Anand Joseph Co-authored-by: Evelina Co-authored-by: Nikolay Karpov Co-authored-by: Nikolay Karpov Co-authored-by: Peter Plantinga Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Xuesong Yang <16880-xueyang@users.noreply.gitlab-master.nvidia.com> Signed-off-by: Alex Cui --- Jenkinsfile | 26 ++++++++++++++++++- .../text_normalization/zh/taggers/money.py | 1 + .../test_cases_word.txt | 2 +- .../docker/launch.sh | 2 +- 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index fdc65626f..273a805be 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -190,7 +190,7 @@ pipeline { } } - stage('L0: Create RU TN/ITN Grammars & SV & PT & ZH') { + stage('L0: Create RU TN/ITN Grammars & SV & PT') { when { anyOf { branch 'main' @@ -229,6 +229,7 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=pt --text="dez " --cache_dir ${PT_TN_CACHE}' } } +<<<<<<< HEAD } } @@ -293,6 +294,8 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ja --text="100" --cache_dir ${JA_TN_CACHE}' } } +======= +>>>>>>> 36fa3af (ZH sentence-level TN (#112)) } } @@ -322,6 +325,27 @@ pipeline { } } } + stage('L0: Create ZH TN/ITN Grammar') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: ZH ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=zh --text="你" --cache_dir ${ZH_TN_CACHE}' + } + } + stage('L0: ZH TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=zh --text="6" --cache_dir ${ZH_TN_CACHE}' + } + } + } + } // L1 Tests 
starts here diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index d0c6d2ca5..7850a6489 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -19,6 +19,7 @@ from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path +# def get_quantity(decimal): suffix = pynini.union( "万", "十万", diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt index 74b965e80..81ef6cdb3 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt @@ -22,4 +22,4 @@ GOOGLE已提供一项NATIVECLIENT软体。~GOOGLE已提供一项NATIVECLIENT软 部落格宣布GOOGLECHROMES的诞生。~部落格宣布GOOGLECHROMES的诞生。 由ZIP订购机场接送或观光景点共乘服务。~由ZIP订购机场接送或观光景点共乘服务。 PAQUE表示短时间应该还不会全面开放。~PAQUE表示短时间应该还不会全面开放。 -CBS是美国一家重要的广播电视网路公司。~CBS是美国一家重要的广播电视网路公司。 \ No newline at end of file +CBS是美国一家重要的广播电视网路公司。~CBS是美国一家重要的广播电视网路公司。 diff --git a/tools/text_processing_deployment/docker/launch.sh b/tools/text_processing_deployment/docker/launch.sh index dea998f7b..1bb4c78ca 100644 --- a/tools/text_processing_deployment/docker/launch.sh +++ b/tools/text_processing_deployment/docker/launch.sh @@ -57,4 +57,4 @@ docker run -it -e LANG=C.UTF-8 -e LC_ALL=C.UTF-8 --rm \ $MOUNTS \ -v $SCRIPT_DIR/../../../tests/nemo_text_processing/:/workspace/tests/ \ -w $WORK_DIR \ - sparrowhawk:latest $CMD \ No newline at end of file + sparrowhawk:latest $CMD From 128753d270ab6045d40bd1231b555eba60078cf7 Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Fri, 7 Jun 2024 12:32:30 -0400 Subject: [PATCH 72/90] Tts en tech terms (#167) * update tts whitelist Signed-off-by: Mariana Graterol 
Fuenmayor * enable normalization of emphasized input Signed-off-by: Mariana Graterol Fuenmayor * add whitelist terms Signed-off-by: Mariana Graterol Fuenmayor * add test for emphasis Signed-off-by: Mariana Graterol Fuenmayor * read card numbers as digits Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * make ccs deterministic Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update jenkins Signed-off-by: Mariana Graterol Fuenmayor * fix sh tests bug Signed-off-by: Mariana Graterol Fuenmayor * fix bug with time Signed-off-by: Mariana Graterol Fuenmayor * update jenkins Signed-off-by: Mariana Graterol Fuenmayor * fix sh time bug Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Alex Cui --- .../text_normalization/en/taggers/electronic.py | 14 ++++++++++++++ .../test_cases_electronic.txt | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/en/taggers/electronic.py b/nemo_text_processing/text_normalization/en/taggers/electronic.py index 521f72efa..c79472da1 100644 --- a/nemo_text_processing/text_normalization/en/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/en/taggers/electronic.py @@ -65,6 +65,8 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT,) + cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT) + accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input") 
accepted_common_domains = pynini.project( pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input" @@ -162,6 +164,18 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) graph |= cc_phrases + if deterministic: + # credit card cues + numbers = pynini.closure(NEMO_DIGIT, 4, 16) + cc_phrases = ( + pynutil.insert("protocol: \"") + + cc_cues + + pynutil.insert("\" domain: \"") + + numbers + + pynutil.insert("\"") + ) + graph |= cc_phrases + final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() \ No newline at end of file diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt index 05831ad1c..12e540a98 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt @@ -40,4 +40,4 @@ enterprise-services@nvidia.com~enterprise dash services at NVIDIA dot com https://www.nvidia.com/dgx-basepod/~HTTPS colon slash slash WWW dot NVIDIA dot com slash DGX dash BASEPOD slash i can use your card ending in 8876~i can use your card ending in eight eight seven six here is mail.nasa.gov.~here is mail dot nasa dot gov. -check us out at some_university.edu.~check us out at some underscore university dot edu. \ No newline at end of file +check us out at some_university.edu.~check us out at some underscore university dot edu. 
From 1d37427b65ed45c2f1c7e7bc2a06315a271c61b4 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Fri, 12 Jul 2024 12:44:46 -0700 Subject: [PATCH 73/90] testing Signed-off-by: Alex Cui --- test.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test.txt diff --git a/test.txt b/test.txt new file mode 100644 index 000000000..e69de29bb From 150d86423996491095c14afe9f6c927b290aafb7 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Fri, 12 Jul 2024 12:45:34 -0700 Subject: [PATCH 74/90] removing test.txt Signed-off-by: Alex Cui --- test.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test.txt diff --git a/test.txt b/test.txt deleted file mode 100644 index e69de29bb..000000000 From 7d0b513ce51087e3cae4265841bda3fed2d001fe Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Mon, 15 Jul 2024 12:47:41 -0700 Subject: [PATCH 75/90] fixing zh tn money curreny on l Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/zh/taggers/money.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 7850a6489..f5f40fb3e 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -19,7 +19,6 @@ from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path -# def get_quantity(decimal): suffix = pynini.union( "万", "十万", @@ -122,19 +121,18 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa + pynini.closure(currency_mandarin_component, 1) ) | ( pynini.closure(currency_component, 1) - + pynutil.insert(" ") + pynini.closure(graph_decimal, 1) + pynini.closure( (pynutil.insert(" ") + pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')), 0, 1 ) ) - graph = ( + final_graph = ( graph_regular_money 
| graph_units | pynutil.add_weight(graph_mandarin_money, -3.0) | pynutil.add_weight(graph_decimal_money, -1.0) ) - final_graph = self.add_tokens(graph) + final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() From a1272f45e52a297872fe7c36d8d2b28d6c0f15f4 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Tue, 16 Jul 2024 09:23:10 -0700 Subject: [PATCH 76/90] resolving failed ci tests for money grammar Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/zh/taggers/money.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index f5f40fb3e..607a968dc 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -121,6 +121,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa + pynini.closure(currency_mandarin_component, 1) ) | ( pynini.closure(currency_component, 1) + + pynutil.insert(" ") + pynini.closure(graph_decimal, 1) + pynini.closure( (pynutil.insert(" ") + pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')), 0, 1 @@ -134,5 +135,5 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa | pynutil.add_weight(graph_decimal_money, -1.0) ) - final_graph = self.add_tokens(final_graph) + final_graph = self.add_tokens(graph_decimal_money) self.fst = final_graph.optimize() From 6314efb79747fc0a0f25a72126995bd556c54a5f Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Tue, 16 Jul 2024 17:02:45 -0700 Subject: [PATCH 77/90] updates for decimal maoney failure Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/zh/taggers/money.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 607a968dc..c48013d2c 
100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -135,5 +135,6 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa | pynutil.add_weight(graph_decimal_money, -1.0) ) - final_graph = self.add_tokens(graph_decimal_money) + #import pdb; pdb.set_trace() + final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() From 61b892bca441157ae66f856bf20d806c5f5a8b1f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 16:23:50 +0000 Subject: [PATCH 78/90] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo_text_processing/text_normalization/zh/taggers/money.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index c48013d2c..60fca3f7d 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -121,7 +121,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa + pynini.closure(currency_mandarin_component, 1) ) | ( pynini.closure(currency_component, 1) - + pynutil.insert(" ") + + pynutil.insert(" ") + pynini.closure(graph_decimal, 1) + pynini.closure( (pynutil.insert(" ") + pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')), 0, 1 From f8fb143b8ae537c1ccbe7f4bf3ee646aade79c60 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Tue, 16 Jul 2024 17:55:09 -0700 Subject: [PATCH 79/90] updates on money grammar for failure cases Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/zh/taggers/money.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py 
b/nemo_text_processing/text_normalization/zh/taggers/money.py index 60fca3f7d..9b4778994 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -135,6 +135,5 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa | pynutil.add_weight(graph_decimal_money, -1.0) ) - #import pdb; pdb.set_trace() final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() From 553d32ef912c394cf7ed6573fc63e93382499cc7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 17 Jul 2024 00:05:30 +0000 Subject: [PATCH 80/90] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../zh/taggers/tokenize_and_classify.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py index c6cd716e0..d35ea178b 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py @@ -78,18 +78,18 @@ def __init__( punctuation = PunctuationFst(deterministic=deterministic) classify = pynini.union( - pynutil.add_weight(date.fst, 1.1), - pynutil.add_weight(fraction.fst, 1.0), - pynutil.add_weight(money.fst, 1.1), - pynutil.add_weight(measure.fst, 1.05), - pynutil.add_weight(time.fst, 1.1), - pynutil.add_weight(whitelist.fst, 1.1), - pynutil.add_weight(cardinal.fst, 1.1), - pynutil.add_weight(decimal.fst, 3.05), - pynutil.add_weight(ordinal.fst, 1.1), - pynutil.add_weight(punctuation.fst, 1.0), - pynutil.add_weight(word.fst, 100), - ) + pynutil.add_weight(date.fst, 1.1), + pynutil.add_weight(fraction.fst, 1.0), + pynutil.add_weight(money.fst, 1.1), + pynutil.add_weight(measure.fst, 
1.05), + pynutil.add_weight(time.fst, 1.1), + pynutil.add_weight(whitelist.fst, 1.1), + pynutil.add_weight(cardinal.fst, 1.1), + pynutil.add_weight(decimal.fst, 3.05), + pynutil.add_weight(ordinal.fst, 1.1), + pynutil.add_weight(punctuation.fst, 1.0), + pynutil.add_weight(word.fst, 100), + ) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") tagger = pynini.closure(token, 1) From 8002a50b95a49d195a3ed6d17cede60f5fa2a158 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Wed, 17 Jul 2024 16:40:06 -0700 Subject: [PATCH 81/90] renaming final graphs Signed-off-by: Alex Cui --- nemo_text_processing/text_normalization/zh/taggers/money.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 9b4778994..d0c6d2ca5 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -128,12 +128,12 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa ) ) - final_graph = ( + graph = ( graph_regular_money | graph_units | pynutil.add_weight(graph_mandarin_money, -3.0) | pynutil.add_weight(graph_decimal_money, -1.0) ) - final_graph = self.add_tokens(final_graph) + final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() From 7b938dec7ee883474716efd6fa18a862eb807674 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Thu, 18 Jul 2024 10:04:38 -0700 Subject: [PATCH 82/90] conflicts Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 273a805be..33581b6be 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' 
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-30-24-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-17-24-0' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' From 0eca73d6e80a530a6bac296d12d7f9c05f10b959 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Thu, 18 Jul 2024 10:13:59 -0700 Subject: [PATCH 83/90] updating data Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 33581b6be..d76ababef 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-17-24-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-18-24-0' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' From 8e3a793e9200728328767ebe229d98e7a87039fe Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Thu, 18 Jul 2024 12:16:39 -0700 Subject: [PATCH 84/90] attempt to resolve jenkins issue Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index d76ababef..d2181937f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' 
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-18-24-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-18-24-1' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' From 2a48144bbe594d27bb7957c072c821c8d4b215e0 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Thu, 18 Jul 2024 12:31:48 -0700 Subject: [PATCH 85/90] ci tests resolving Signed-off-by: Alex Cui --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index d2181937f..273a805be 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-18-24-1' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-30-24-0' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' From 6351b49bd845219c3bab65075c23163324bf51ef Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Jul 2024 23:33:11 +0000 Subject: [PATCH 86/90] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo_text_processing/text_normalization/ar/taggers/decimal.py | 2 +- .../text_normalization/en/taggers/electronic.py | 2 +- .../text_normalization/hu/taggers/electronic.py | 2 +- .../text_normalization/it/taggers/electronic.py | 2 +- 
.../text_normalization/it/verbalizers/electronic.py | 2 +- tests/nemo_text_processing/fr/test_whitelist.py | 2 +- tests/nemo_text_processing/fr/test_word.py | 2 +- tests/nemo_text_processing/hy/test_decimal.py | 2 +- tests/nemo_text_processing/hy/test_fraction.py | 2 +- tests/nemo_text_processing/hy/test_measure.py | 2 +- tests/nemo_text_processing/hy/test_money.py | 2 +- tests/nemo_text_processing/hy/test_ordinal.py | 2 +- tests/nemo_text_processing/hy/test_time.py | 2 +- tests/nemo_text_processing/hy/test_whitelist.py | 2 +- tests/nemo_text_processing/hy/test_word.py | 2 +- tests/nemo_text_processing/mr/test_date.py | 4 +++- 16 files changed, 18 insertions(+), 16 deletions(-) diff --git a/nemo_text_processing/text_normalization/ar/taggers/decimal.py b/nemo_text_processing/text_normalization/ar/taggers/decimal.py index e8325214a..f276155e9 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/ar/taggers/decimal.py @@ -77,4 +77,4 @@ def __init__(self, cardinal: GraphFst, deterministic: bool): ) self.final_graph = self.add_tokens(self.final_graph_decimal) - self.fst = self.final_graph.optimize() \ No newline at end of file + self.fst = self.final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/en/taggers/electronic.py b/nemo_text_processing/text_normalization/en/taggers/electronic.py index c79472da1..32e6d03e1 100644 --- a/nemo_text_processing/text_normalization/en/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/en/taggers/electronic.py @@ -178,4 +178,4 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): final_graph = self.add_tokens(graph) - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hu/taggers/electronic.py b/nemo_text_processing/text_normalization/hu/taggers/electronic.py index 8dad44e15..67160e99e 100644 --- 
a/nemo_text_processing/text_normalization/hu/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/hu/taggers/electronic.py @@ -112,4 +112,4 @@ def __init__(self, deterministic: bool = True): self.graph = graph final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true")) - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/it/taggers/electronic.py b/nemo_text_processing/text_normalization/it/taggers/electronic.py index ef2bb756e..2fe3e855e 100644 --- a/nemo_text_processing/text_normalization/it/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/it/taggers/electronic.py @@ -103,4 +103,4 @@ def __init__(self, deterministic: bool = True): self.graph = graph final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true")) - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/it/verbalizers/electronic.py b/nemo_text_processing/text_normalization/it/verbalizers/electronic.py index 0f1c87e41..f7071d3d2 100644 --- a/nemo_text_processing/text_normalization/it/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/it/verbalizers/electronic.py @@ -85,4 +85,4 @@ def add_space_after_char(): ) delete_tokens = self.delete_tokens(self.graph + delete_preserve_order) - self.fst = delete_tokens.optimize() \ No newline at end of file + self.fst = delete_tokens.optimize() diff --git a/tests/nemo_text_processing/fr/test_whitelist.py b/tests/nemo_text_processing/fr/test_whitelist.py index dd210b9b7..dac398fba 100644 --- a/tests/nemo_text_processing/fr/test_whitelist.py +++ b/tests/nemo_text_processing/fr/test_whitelist.py @@ -39,4 +39,4 @@ def test_denorm(self, test_input, expected): @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) - 
assert pred == expected \ No newline at end of file + assert pred == expected diff --git a/tests/nemo_text_processing/fr/test_word.py b/tests/nemo_text_processing/fr/test_word.py index bfda32bd7..6d48db4fc 100644 --- a/tests/nemo_text_processing/fr/test_word.py +++ b/tests/nemo_text_processing/fr/test_word.py @@ -39,4 +39,4 @@ def test_denorm(self, test_input, expected): @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) - assert pred == expected \ No newline at end of file + assert pred == expected diff --git a/tests/nemo_text_processing/hy/test_decimal.py b/tests/nemo_text_processing/hy/test_decimal.py index 8de89408a..aaa65a0b7 100644 --- a/tests/nemo_text_processing/hy/test_decimal.py +++ b/tests/nemo_text_processing/hy/test_decimal.py @@ -22,7 +22,7 @@ class TestDecimal: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_decimal.txt')) diff --git a/tests/nemo_text_processing/hy/test_fraction.py b/tests/nemo_text_processing/hy/test_fraction.py index 1e866de30..c9fcc7873 100644 --- a/tests/nemo_text_processing/hy/test_fraction.py +++ b/tests/nemo_text_processing/hy/test_fraction.py @@ -22,7 +22,7 @@ class TestFraction: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=True) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_fraction.txt')) diff --git a/tests/nemo_text_processing/hy/test_measure.py b/tests/nemo_text_processing/hy/test_measure.py index d66387ac5..9402523c7 100644 --- a/tests/nemo_text_processing/hy/test_measure.py +++ b/tests/nemo_text_processing/hy/test_measure.py @@ -22,7 +22,7 @@ class TestMeasure: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) 
@parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_measure.txt')) diff --git a/tests/nemo_text_processing/hy/test_money.py b/tests/nemo_text_processing/hy/test_money.py index 5e0a0f72b..291ce764f 100644 --- a/tests/nemo_text_processing/hy/test_money.py +++ b/tests/nemo_text_processing/hy/test_money.py @@ -22,7 +22,7 @@ class TestMoney: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_money.txt')) diff --git a/tests/nemo_text_processing/hy/test_ordinal.py b/tests/nemo_text_processing/hy/test_ordinal.py index a724969e2..1e93f5f2e 100644 --- a/tests/nemo_text_processing/hy/test_ordinal.py +++ b/tests/nemo_text_processing/hy/test_ordinal.py @@ -22,7 +22,7 @@ class TestOrdinal: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=True) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_ordinal.txt')) diff --git a/tests/nemo_text_processing/hy/test_time.py b/tests/nemo_text_processing/hy/test_time.py index ba5f21b29..6c0f72537 100644 --- a/tests/nemo_text_processing/hy/test_time.py +++ b/tests/nemo_text_processing/hy/test_time.py @@ -22,7 +22,7 @@ class TestTime: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_time.txt')) diff --git a/tests/nemo_text_processing/hy/test_whitelist.py b/tests/nemo_text_processing/hy/test_whitelist.py index b16708851..75562cf9f 100644 --- a/tests/nemo_text_processing/hy/test_whitelist.py +++ b/tests/nemo_text_processing/hy/test_whitelist.py @@ -22,7 +22,7 @@ class TestWhitelist: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) 
@parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_whitelist.txt')) diff --git a/tests/nemo_text_processing/hy/test_word.py b/tests/nemo_text_processing/hy/test_word.py index ea69ea32a..30f7274b1 100644 --- a/tests/nemo_text_processing/hy/test_word.py +++ b/tests/nemo_text_processing/hy/test_word.py @@ -22,7 +22,7 @@ class TestWord: - + inverse_normalizer = InverseNormalizer(lang='hy', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hy/data_inverse_text_normalization/test_cases_word.txt')) diff --git a/tests/nemo_text_processing/mr/test_date.py b/tests/nemo_text_processing/mr/test_date.py index 8fa51305a..537103bea 100644 --- a/tests/nemo_text_processing/mr/test_date.py +++ b/tests/nemo_text_processing/mr/test_date.py @@ -15,8 +15,8 @@ import pytest from parameterized import parameterized -from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -30,6 +30,8 @@ class TestMath: def test_norm_math(self, test_input, expected): preds = self.normalizer_zh.normalize(test_input) assert expected == preds + + class TestDate: inverse_normalizer_mr = InverseNormalizer(lang='mr', cache_dir=CACHE_DIR, overwrite_cache=False) From 49d0058f37937dd331d88187788a514055101686 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Fri, 26 Jul 2024 08:50:52 -0700 Subject: [PATCH 87/90] resolving conflict Signed-off-by: Alex Cui --- Jenkinsfile | 3 --- 1 file changed, 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 273a805be..24a46e1d8 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -229,7 +229,6 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=pt --text="dez " --cache_dir ${PT_TN_CACHE}' } } 
-<<<<<<< HEAD } } @@ -294,8 +293,6 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ja --text="100" --cache_dir ${JA_TN_CACHE}' } } -======= ->>>>>>> 36fa3af (ZH sentence-level TN (#112)) } } From ade0b91aaab41daf2364348170ee709e31692b1a Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Fri, 26 Jul 2024 11:13:32 -0700 Subject: [PATCH 88/90] Jenkins test not starting, copied form main branch Signed-off-by: Alex Cui --- Jenkinsfile | 48 ------------------------------------------------ 1 file changed, 48 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 24a46e1d8..367b6a449 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -296,54 +296,6 @@ pipeline { } } - stage('L0: Create HY TN/ITN Grammars & MR') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L0: MR ITN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=mr --text="शून्य " --cache_dir ${MR_TN_CACHE}' - } - } - stage('L0: HY TN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hy --text="6" --cache_dir ${HY_TN_CACHE}' - } - } - stage('L0: HY ITN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hy --text="վեց " --cache_dir ${HY_TN_CACHE}' - } - } - } - } - stage('L0: Create ZH TN/ITN Grammar') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L0: ZH ITN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=zh --text="你" --cache_dir ${ZH_TN_CACHE}' - } - } - stage('L0: ZH TN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=zh 
--text="6" --cache_dir ${ZH_TN_CACHE}' - } - } - } - } - // L1 Tests starts here From 4e149fab92cb0c8dee803709808cec6f1e8ef7e8 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Mon, 29 Jul 2024 14:26:12 -0700 Subject: [PATCH 89/90] copied from Nemo main, esolving Jenkins isue Signed-off-by: Alex Cui --- tests/nemo_text_processing/mr/test_cardinal.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/nemo_text_processing/mr/test_cardinal.py b/tests/nemo_text_processing/mr/test_cardinal.py index 7fe575599..588b7ebba 100644 --- a/tests/nemo_text_processing/mr/test_cardinal.py +++ b/tests/nemo_text_processing/mr/test_cardinal.py @@ -22,14 +22,6 @@ class TestPreprocess: - normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') - - @parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_preprocess.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_norm_preprocess(self, test_input, expected): - preds = self.normalizer_zh.normalize(test_input) - assert expected == preds inverse_normalizer_mr = InverseNormalizer(lang='mr', cache_dir=CACHE_DIR, overwrite_cache=False) From 17ccdaa367768ae6df4dbc5d463aae4c3155cb79 Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Mon, 29 Jul 2024 14:26:37 -0700 Subject: [PATCH 90/90] copied from NeMo main, resolving Jenkins issue Signed-off-by: Alex Cui --- tests/nemo_text_processing/mr/test_date.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tests/nemo_text_processing/mr/test_date.py b/tests/nemo_text_processing/mr/test_date.py index 537103bea..b58c1e532 100644 --- a/tests/nemo_text_processing/mr/test_date.py +++ b/tests/nemo_text_processing/mr/test_date.py @@ -21,17 +21,6 @@ from ..utils import CACHE_DIR, parse_test_case_file -class TestMath: - normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') - - 
@parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_math.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_norm_math(self, test_input, expected): - preds = self.normalizer_zh.normalize(test_input) - assert expected == preds - - class TestDate: inverse_normalizer_mr = InverseNormalizer(lang='mr', cache_dir=CACHE_DIR, overwrite_cache=False)