From f40a62d52abebacb29694455ce09ca3d3d7e3588 Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Wed, 28 Feb 2024 10:26:53 -0800 Subject: [PATCH 01/18] enable capitalized itn for es Signed-off-by: Mariana Graterol Fuenmayor --- .../es/data/dates/months_cased.tsv | 12 ++ .../es/data/dates/year_suffix_cased.tsv | 11 ++ .../es/data/measures/measurements_plural.tsv | 12 +- .../data/measures/measurements_singular.tsv | 12 +- .../currency_major_plural_capitalized.tsv | 72 ++++++++++ .../currency_major_singular_capitalized.tsv | 73 ++++++++++ .../es/data/ordinals/digit_capitalized.tsv | 22 +++ .../es/data/ordinals/hundreds_capitalized.tsv | 18 +++ .../es/data/ordinals/teen_capitalized.tsv | 60 ++++++++ .../es/data/ordinals/ties_capitalized.tsv | 15 ++ .../es/data/ordinals/twenties_capitalized.tsv | 50 +++++++ .../es/data/roman/digit.tsv | 18 +-- .../es/data/roman/hundreds.tsv | 18 +-- .../es/data/roman/thousands.tsv | 6 +- .../es/data/roman/ties.tsv | 18 +-- .../es/data/time/time_suffix_cased.tsv | 15 ++ .../es/data/time/time_zone.tsv | 84 ++++++------ .../es/data/time/time_zone_cased.tsv | 42 ++++++ .../es/data/whitelist.tsv | 28 ++-- .../es/taggers/cardinal.py | 85 ++++++++---- .../es/taggers/date.py | 23 +++- .../es/taggers/decimal.py | 50 +++++-- .../es/taggers/electronic.py | 128 +++++++++++++++--- .../es/taggers/fraction.py | 10 +- .../es/taggers/measure.py | 16 ++- .../es/taggers/money.py | 38 +++++- .../es/taggers/ordinal.py | 29 +++- .../es/taggers/telephone.py | 41 +++++- .../es/taggers/time.py | 60 +++++--- .../es/taggers/tokenize_and_classify.py | 24 ++-- .../es/taggers/whitelist.py | 53 +++++++- .../es/verbalizers/date.py | 6 +- .../es/verbalizers/time.py | 4 +- .../text_normalization/es/graph_utils.py | 3 + .../test_cases_cardinal_cased.txt | 30 ++++ .../test_cases_date_cased.txt | 8 ++ .../test_cases_decimal_cased.txt | 6 + .../test_cases_electronic_cased.txt | 5 + .../test_cases_measure.txt | 2 +- .../test_cases_measure_cased.txt | 11 ++ .../test_cases_money_cased.txt | 6 + .../test_cases_ordinal_cased.txt | 11 ++ .../test_cases_telephone_cased.txt | 6 + .../test_cases_time.txt | 2 +- .../test_cases_time_cased.txt | 9 ++ .../test_cases_whitelist.txt | 10 +- .../test_cases_word_cased.txt | 11 ++ .../nemo_text_processing/es/test_cardinal.py | 18 ++- tests/nemo_text_processing/es/test_date.py | 13 ++ tests/nemo_text_processing/es/test_decimal.py | 14 ++ .../es/test_electronic.py | 16 +++ .../nemo_text_processing/es/test_fraction.py | 16 +++ tests/nemo_text_processing/es/test_measure.py | 15 ++ tests/nemo_text_processing/es/test_money.py | 15 ++ tests/nemo_text_processing/es/test_ordinal.py | 14 ++ .../nemo_text_processing/es/test_telephone.py | 15 ++ tests/nemo_text_processing/es/test_time.py | 13 ++ tests/nemo_text_processing/es/test_word.py | 15 ++ 58 files changed, 1220 insertions(+), 217 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/es/data/dates/months_cased.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/es/data/dates/year_suffix_cased.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural_capitalized.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular_capitalized.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/es/data/ordinals/digit_capitalized.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/es/data/ordinals/hundreds_capitalized.tsv create mode 100644 
nemo_text_processing/inverse_text_normalization/es/data/ordinals/teen_capitalized.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/es/data/ordinals/ties_capitalized.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/es/data/ordinals/twenties_capitalized.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/es/data/time/time_suffix_cased.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/es/data/time/time_zone_cased.tsv create mode 100644 tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_cardinal_cased.txt create mode 100644 tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_date_cased.txt create mode 100644 tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_decimal_cased.txt create mode 100644 tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_electronic_cased.txt create mode 100644 tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure_cased.txt create mode 100644 tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_money_cased.txt create mode 100644 tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_ordinal_cased.txt create mode 100644 tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_telephone_cased.txt create mode 100644 tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time_cased.txt create mode 100644 tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_word_cased.txt diff --git a/nemo_text_processing/inverse_text_normalization/es/data/dates/months_cased.tsv b/nemo_text_processing/inverse_text_normalization/es/data/dates/months_cased.tsv new file mode 100644 index 000000000..137183097 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/dates/months_cased.tsv @@ -0,0 +1,12 @@ +Enero +Febrero +Marzo +Abril +Mayo +Junio +Julio +Agosto +Septiembre +Octubre +Noviembre +Diciembre \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/dates/year_suffix_cased.tsv b/nemo_text_processing/inverse_text_normalization/es/data/dates/year_suffix_cased.tsv new file mode 100644 index 000000000..221fd3605 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/dates/year_suffix_cased.tsv @@ -0,0 +1,11 @@ +A. N. E. antes de nuestra era +A. E. C. antes de la era común +A. C. antes de Cristo +A. J. C. antes de Jesucristo +A. P. antes del presente +N. E. nuestra era +E. C. era común +D. C. después de Cristo +D. D. J. C. después de Jesucristo +B. C. B C +A. D. 
a d diff --git a/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_plural.tsv b/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_plural.tsv index e0f4284cc..1986f5cdc 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_plural.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_plural.tsv @@ -23,6 +23,12 @@ gsm g s m gsm ge ese eme psi p s i psi pe ese i -° c grados centígrados -° f grados farenheit -° k grados kelvin +° C grados centígrados +° F grados farenheit +° K grados kelvin +mb megabits +MB megabytes +gb gigabits +GB gigabytes +TB terabytes +PB petabytes \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_singular.tsv b/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_singular.tsv index 22163c72f..13f977c83 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_singular.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_singular.tsv @@ -17,9 +17,9 @@ min minuto % por ciento % porciento s segundo -° c grado centígrado -° f grado farenheit -° k grado kelvin +° C grado centígrado +° F grado farenheit +° K grado kelvin mph milla por hora kph kilómetro por hora gsm gramo por metro cuadrado @@ -27,3 +27,9 @@ gsm g s m gsm ge ese eme psi p s i psi pe ese i +mb megabit +MB megabyte +gb gigabit +GB gigabyte +TB terabyte +PB petabyte \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural_capitalized.tsv new file mode 100644 index 000000000..129f641e5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural_capitalized.tsv @@ -0,0 +1,72 @@ +US$ Dólares Estadounidenses +US$ dólares Estadounidenses +US$ Dólares estadounidenses +US$ Dólares Americanos +US$ dólares Americanos +US$ Dólares americanos +AR$ Pesos Argentinos +AR$ pesos Argentinos +AR$ Pesos argentinos +BRL Reales Brasileños +BRL reales Brasileños +BRL Reales brasileños +CHF Francos Suizos +CHF francos Suizos +CHF Francos suizos +CLP Pesos Chilenos +CLP pesos Chilenos +CLP Pesos chilenos +CNY Yuan Chinos +CNY yuan Chinos +CNY Yuan chinos +COP Pesos Colombianos +COP pesos Colombianos +COP Pesos colombianos +CRC Colones Costarricenses +CRC colones Costarricenses +CRC Colones costarricenses +CUP Pesos Cubanos +CUP pesos Cubanos +CUP Pesos cubanos +RD$ Pesos Dominicanos +RD$ pesos Dominicanos +RD$ Pesos dominicanos +GBP Libras Esterlinas +GBP libras Esterlinas +GBP Libras esterlinas +HKD Dólares De Hong Kong +HKD dólares de Hong Kong +HKD Dólares de hong kong +INR Rupias Indias +INR rupias Indias +INR Rupias indias +Mex$ Pesos Mexicanos +Mex$ pesos Mexicanos +Mex$ Pesos mexicanos +SVC Colones Salvadoreños +SVC colones Salvadoreños +SVC Colones salvadoreños +UYU Pesos Uruguayos +UYU pesos Uruguayos +UYU Pesos uruguayos +VES Bolívares Soberanos +VES bolívares Soberanos +VES Bolívares soberanos +BOP Pesos Bolivianos +BOP pesos Bolivianos +BOP Pesos bolivianos +CLE Escudos Chilenos +CLE escudos Chilenos +CLE Escudos chilenos +ECS Sucres Ecuatorianos +ECS sucres Ecuatorianos +ECS Sucres ecuatorianos +PEH Soles De Oro +PEH soles de Oro +PEH Soles de oro +VEB Bolívares Venezolanos +VEB bolívares Venezolanos +VEB Bolívares venezolanos 
+VEF Bolívares Fuertes +VEF bolívares Fuertes +VEF Bolívares fuertes \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular_capitalized.tsv new file mode 100644 index 000000000..995741a2f --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular_capitalized.tsv @@ -0,0 +1,73 @@ +US$ dólar Estadounidense +US$ Dólar Estadounidense +US$ Dólar estadounidense +US$ dólar Estadounidense +US$ dólar Americano +US$ Dólar Americano +US$ Dólar americano +AR$ peso Argentino +AR$ Peso Argentino +AR$ Peso argentino +BRL real Brasileño +BRL Real Brasileño +BRL Real brasileño +CHF franco Suizo +CHF Franco Suizo +CHF Franco suizo +CLP Peso Chileno +CLP peso Chileno +CLP Peso chileno +CNY Yuan Chino +CNY yuan Chino +CNY Yuan chino +COP Peso Colombiano +COP peso Colombiano +COP Peso colombiano +CRC Colón Costarricense +CRC colón Costarricense +CRC Colón costarricense +CUP Peso Cubano +CUP peso Cubano +CUP Peso cubano +RD$ Peso Dominicano +RD$ peso Dominicano +RD$ Peso dominicano +GBP Libra Esterlina +GBP libra Esterlina +GBP Libra esterlina +HKD Dólar De Hong Kong +HKD dólar de Hong Kong +HKD Dólar de hong kong +INR Rupia India +INR rupia India +INR Rupia india +Mex$ Peso Mexicano +Mex$ peso Mexicano +Mex$ Peso mexicano +SVC Colón Salvadoreño +SVC colón Salvadoreño +SVC Colón salvadoreño +UYU Peso Uruguayo +UYU peso Uruguayo +UYU Peso uruguayo +VES Bolívar Soberano +VES bolívar Soberano +VES Bolívar soberano +BOP Peso Boliviano +BOP peso Boliviano +BOP Peso boliviano +CLE Escudo Chileno +CLE escudo Chileno +CLE Escudo chileno +ECS Sucre Ecuatoriano +ECS sucre Ecuatoriano +ECS Sucre ecuatoriano +PEH Sol De Oro +PEH sol de Oro +PEH Sol de oro +VEB Bolívar Venezolano +VEB bolívar Venezolano +VEB Bolívar venezolano +VEF Bolívar Fuerte +VEF bolívar Fuerte +VEF Bolívar fuerte \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/ordinals/digit_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/digit_capitalized.tsv new file mode 100644 index 000000000..459ea85e1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/digit_capitalized.tsv @@ -0,0 +1,22 @@ +Primero uno +Primera uno +Primer uno +Segundo dos +Segunda dos +Tercero tres +Tercera tres +Tercer tres +Cuarto cuatro +Cuarta cuatro +Quinto cinco +Quinta cinco +Sexto seis +Sexta seis +Séptimo siete +Séptima siete +Sétimo siete +Sétima siete +Octavo ocho +Octava ocho +Noveno nueve +Novena nueve diff --git a/nemo_text_processing/inverse_text_normalization/es/data/ordinals/hundreds_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/hundreds_capitalized.tsv new file mode 100644 index 000000000..0172f8f63 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/hundreds_capitalized.tsv @@ -0,0 +1,18 @@ +Centésimo ciento +Centésima ciento +Ducentésimo doscientos +Ducentésima doscientos +Tricentésimo trescientos +Tricentésima trescientos +Cuadringentésimo cuatrocientos +Cuadringentésima cuatrocientos +Quingentésimo quinientos +Quingentésima quinientos +Sexcentésimo seiscientos +Sexcentésima seiscientos +Septingentésimo setecientos +Septingentésima setecientos +Octingentésimo ochocientos +Octingentésima ochocientos +Noningentésimo novecientos +Noningentésima novecientos diff --git 
a/nemo_text_processing/inverse_text_normalization/es/data/ordinals/teen_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/teen_capitalized.tsv new file mode 100644 index 000000000..80f012ba5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/teen_capitalized.tsv @@ -0,0 +1,60 @@ +Décimo diez +Décima diez +Decimoprimero once +Decimoprimera once +Decimoprimer once +Décimo Primero once +Décima Primera once +Décimo Primera once +Décimo Primer once +Undécimo once +Undécima once +Decimosegundo doce +Decimosegunda doce +Décimo Segundo doce +Décima Segunda doce +Décimo Segunda doce +Duodécimo doce +Duodécima doce +Decimotercero trece +Decimotercera trece +Decimotercer trece +Décimo Tercero trece +Décima Tercera trece +Décimo Tercera trece +Décimo Tercer trece +Decimocuarto catorce +Decimocuarta catorce +Décimo Cuarto catorce +Décima Cuarta catorce +Décimo Cuarta catorce +Decimoquinto quince +Decimoquinta quince +Décimo Quinto quince +Décima Quinta quince +Décimo Quinta quince +Decimosexto dieciséis +Decimosexta dieciséis +Décimo Sexto dieciséis +Décima Sexta dieciséis +Décimo Sexta dieciséis +Decimoséptimo diecisiete +Decimoséptima diecisiete +Décimo Séptimo diecisiete +Décima Séptima diecisiete +Décimo Séptima diecisiete +Décimo Sétimo diecisiete +Décimo Sétima diecisiete +Décima Sétima diecisiete +Decimosétimo diecisiete +Decimosétima diecisiete +Decimoctavo dieciocho +Decimoctava dieciocho +Décimo Octavo dieciocho +Décima Octava dieciocho +Décimo Octava dieciocho +Decimonoveno diecinueve +Decimonovena diecinueve +Décimo Noveno diecinueve +Décima Novena diecinueve +Décimo Novena diecinueve diff --git a/nemo_text_processing/inverse_text_normalization/es/data/ordinals/ties_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/ties_capitalized.tsv new file mode 100644 index 000000000..58e0eff28 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/ties_capitalized.tsv @@ -0,0 +1,15 @@ +Vigésimo veinte +Vigésima veinte +Trigésimo treinta +Cuadragésimo cuarenta +Cuadragésima cuarenta +Quincuagésimo cincuenta +Quincuagésima cincuenta +Sexagésimo sesenta +Sexagésima sesenta +Septuagésimo setenta +Septuagésima setenta +Octogésimo ochenta +Octogésima ochenta +Nonagésimo noventa +Nonagésima noventa diff --git a/nemo_text_processing/inverse_text_normalization/es/data/ordinals/twenties_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/twenties_capitalized.tsv new file mode 100644 index 000000000..40e73e815 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/twenties_capitalized.tsv @@ -0,0 +1,50 @@ +Vigesimoprimero veintiuno +Vigesimoprimera veintiuno +Vigesimoprimer veintiuno +Vigésimo Primero veintiuno +Vigésimo Primera veintiuno +Vigésima Primera veintiuno +Vigésimo Primer veintiuno +Vigesimosegundo veintidós +Vigesimosegunda veintidós +Vigésimo Segundo veintidós +Vigésimo Segunda veintidós +Vigésima Segunda veintidós +Vigesimotercero veintitrés +Vigesimotercera veintitrés +Vigesimotercer veintitrés +Vigésimo Tercero veintitrés +Vigésimo Tercera veintitrés +Vigésima Tercera veintitrés +Vigésimo Tercer veintitrés +Vigesimocuarto veinticuatro +Vigesimocuarta veinticuatro +Vigésimo Cuarto veinticuatro +Vigésimo Cuarta veinticuatro +Vigésima Cuarta veinticuatro +Vigesimoquinto veinticinco +Vigesimoquinta veinticinco +Vigésimo Quinto veinticinco +Vigésimo Quinta veinticinco +Vigésima Quinta veinticinco +Vigesimosexto 
veintiséis +Vigesimosexta veintiséis +Vigésimo Sexto veintiséis +Vigésimo Sexta veintiséis +Vigésima Sexta veintiséis +Vigesimoséptimo veintisiete +Vigesimoséptima veintisiete +Vigésimo Séptimo veintisiete +Vigésimo Séptima veintisiete +Vigésima Séptima veintisiete +Vigesimoctavo veintiocho +Vigesimoctava veintiocho +Vigesimooctavo veintiocho +Vigesimooctava veintiocho +Vigésimo Octavo veintiocho +Vigésimo Octava veintiocho +Vigésima Octava veintiocho +Vigesimonoveno veintinueve +Vigesimonovena veintinueve +Vigésimo Noveno veintinueve +Vigésimo Novena veintinueve diff --git a/nemo_text_processing/inverse_text_normalization/es/data/roman/digit.tsv b/nemo_text_processing/inverse_text_normalization/es/data/roman/digit.tsv index 0610b4a54..e5fde2cc6 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/roman/digit.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/roman/digit.tsv @@ -1,9 +1,9 @@ -i 1 -ii 2 -iii 3 -iv 4 -v 5 -vi 6 -vii 7 -viii 8 -ix 9 \ No newline at end of file +I 1 +II 2 +III 3 +IV 4 +V 5 +VI 6 +VII 7 +VIII 8 +IX 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/roman/hundreds.tsv b/nemo_text_processing/inverse_text_normalization/es/data/roman/hundreds.tsv index cdbdb6814..5e04779be 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/roman/hundreds.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/roman/hundreds.tsv @@ -1,9 +1,9 @@ -c 1 -cc 2 -ccc 3 -cd 4 -d 5 -dc 6 -dcc 7 -dccc 8 -cm 9 \ No newline at end of file +C 1 +CC 2 +CCC 3 +CD 4 +D 5 +DC 6 +DCC 7 +DCCC 8 +CM 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/roman/thousands.tsv b/nemo_text_processing/inverse_text_normalization/es/data/roman/thousands.tsv index 19e96b9c6..164689802 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/roman/thousands.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/roman/thousands.tsv @@ -1,3 +1,3 @@ -m 1 -mm 2 -mmm 3 +M 1 +MM 2 +MMM 3 diff --git a/nemo_text_processing/inverse_text_normalization/es/data/roman/ties.tsv b/nemo_text_processing/inverse_text_normalization/es/data/roman/ties.tsv index ac043aa14..445773d91 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/roman/ties.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/roman/ties.tsv @@ -1,9 +1,9 @@ -x 1 -xx 2 -xxx 3 -xl 4 -l 5 -lx 6 -lxx 7 -lxxx 8 -xc 9 \ No newline at end of file +X 1 +XX 2 +XXX 3 +XL 4 +L 5 +LX 6 +LXX 7 +LXXX 8 +XC 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/time/time_suffix_cased.tsv b/nemo_text_processing/inverse_text_normalization/es/data/time/time_suffix_cased.tsv new file mode 100644 index 000000000..b04bd5193 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/time/time_suffix_cased.tsv @@ -0,0 +1,15 @@ +Peme P.M. +Pe Eme P.M. +P M P.M. +PM P.M. +P.M. +p.M P.M. +Ame A.M +A Eme A.M +AM A.M +A.M +A.M A.M +A M A.M +de la tarde P.M. +de la noche P.M. 
+de la mañana A.M diff --git a/nemo_text_processing/inverse_text_normalization/es/data/time/time_zone.tsv b/nemo_text_processing/inverse_text_normalization/es/data/time/time_zone.tsv index 6c86a3e4e..55f0297b0 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/time/time_zone.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/time/time_zone.tsv @@ -1,42 +1,42 @@ -utc u t c -cst c s t -cet c e t -pst p s t -est e s t -mdt m d t -mst m s t -pt p t -et e t -mt m t -gmt g m t -adt hora de verano del atlántico -amt hora estándar del amazonas -art hora estándar de argentina -ast hora estándar del atlántico -bot hora de bolivia -brt hora estándar de brasilia -clst hora de verano de chile -clt hora estándar de chile -cot hora estándar de colombia -east hora estándar de la isla de pascua -ect hora de ecuador -eeast hora de verano de la isla de pascua -eest hora de verano de europa oriental -eet hora estándar de europa oriental -fkst hora de verano de las malvinas -fnt hora estándar de fernando de noronha -galt hora de galápagos -gft hora de la guayana francesa -gyt hora de guyana -hkt hora estándar de hong kong -jst hora estándar de japón -kst hora estándar de corea -pet hora estándar de perú -pyst hora de verano de paraguay -pyt hora estándar de paraguay -sgt hora de singapur -uyst hora de verano de uruguay -uyt hora de uruguay -vet hora de venezuela -west hora de verano de europa oriental -wet hora estándar de europa oriental +UTC u t c +CST c s t +CET c e t +PST p s t +EST e s t +MDT m d t +MST m s t +PT p t +ET e t +MT m t +GMT g m t +ADT hora de verano del atlántico +AMT hora estándar del amazonas +ART hora estándar de argentina +AST hora estándar del atlántico +BOT hora de bolivia +BRT hora estándar de brasilia +CLST hora de verano de chile +CLT hora estándar de chile +COT hora estándar de colombia +EAST hora estándar de la isla de pascua +ECT hora de ecuador +EEAST hora de verano de la isla de pascua +EEST hora de verano de europa oriental +EET hora estándar de europa oriental +FKST hora de verano de las malvinas +FNT hora estándar de fernando de noronha +GALT hora de galápagos +GFT hora de la guayana francesa +GYT hora de guyana +HKT hora estándar de hong kong +JST hora estándar de japón +KST hora estándar de corea +PET hora estándar de perú +PYST hora de verano de paraguay +PYT hora estándar de paraguay +SGT hora de singapur +UYST hora de verano de uruguay +UYT hora de uruguay +VET hora de venezuela +WEST hora de verano de europa oriental +WET hora estándar de europa oriental \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/time/time_zone_cased.tsv b/nemo_text_processing/inverse_text_normalization/es/data/time/time_zone_cased.tsv new file mode 100644 index 000000000..e635698d3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/time/time_zone_cased.tsv @@ -0,0 +1,42 @@ +UTC U T C +CST C S T +CET C E T +PST P S T +EST E S T +MDT M D T +MST M S T +PT P T +ET E T +MT M T +GMT G M T +ADT Hora de Verano del Atlántico +AMT Hora Estándar del Amazonas +ART Hora Estándar de Argentina +AST Hora Estándar del Atlántico +BOT Hora de Bolivia +BRT Hora Estándar de Brasilia +CLST Hora de Verano de Chile +CLT Hora Estándar de Chile +COT Hora Estándar de Colombia +EAST Hora Estándar de la Isla de Pascua +ECT Hora de Ecuador +EEAST Hora de Verano de la Isla de Pascua +EEST Hora de Verano de Europa Oriental +EET Hora Estándar de Europa Oriental +FKST Hora de Verano de Las Malvinas +FNT Hora Estándar de 
Fernando de Noronha +GALT Hora de Galápagos +GFT Hora de la Guayana Francesa +GYT Hora de Guyana +HKT Hora Estándar de Hong Kong +JST Hora Estándar de Japón +KST Hora Estándar de Corea +PET Hora Estándar de Perú +PYST Hora de Verano de Paraguay +PYT Hora Estándar de Paraguay +SGT Hora de Singapur +UYST Hora de Verano de Uruguay +UYT Hora de Uruguay +VET Hora de Venezuela +WEST Hora de Verano de Europa Oriental +WET Hora Estándar de Europa Oriental \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv index 60253820a..8d81f2c09 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv @@ -1,16 +1,16 @@ -ud. usted -uds. ustedes -vd. vosotros -vds. vosotros -dr. doctor -dra. doctora -d. don -da. doña -ee. uu. estados unidos +Ud. usted +Uds. ustedes +Vd. vosotros +Vds. vosotros +Dr. doctor +Dra. doctora +D. don +Da. doña +EE. UU. estados unidos p.ej. por ejemplo -prof. profesor -profa. profesora -sr. señor -sra. señora -srta. señorita +Prof. profesor +Profa. profesora +Sr. señor +Sra. señora +Srta. señorita etc. etcétera diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py index 07a1e8316..8b007a5ef 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py @@ -15,7 +15,16 @@ import pynini from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_SPACE, GraphFst, delete_space +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + NEMO_DIGIT, + NEMO_SPACE, + GraphFst, + capitalized_input_graph, + delete_space, +) +from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS from pynini.lib import pynutil @@ -34,10 +43,15 @@ class CardinalFst(GraphFst): inside cardinal numbers). e.g. "mil y una" -> cardinal { integer: "1001"} e.g. "ciento y una" -> cardinal { integer: "101"} + + Args: + input_case: accepting either "lower_cased" or "cased" input. 
""" - def __init__(self): + def __init__(self, input_case: str = INPUT_LOWER_CASED): super().__init__(name="cardinal", kind="classify") + self.input_case = input_case + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")) @@ -46,7 +60,7 @@ def __init__(self): graph_hundreds = pynini.string_file(get_abs_path("data/numbers/hundreds.tsv")) full_graph_ties = (graph_ties | pynutil.insert("0")) + ( - (delete_space + pynutil.delete("y") + delete_space + graph_digit) | pynutil.insert("0") + (delete_space + self.delete_word("y") + delete_space + graph_digit) | pynutil.insert("0") ) graph_hundred_component = graph_hundreds | pynutil.insert("0") @@ -60,27 +74,27 @@ def __init__(self): ) self.graph_hundred_component_at_least_one_none_zero_digit = ( graph_hundred_component_at_least_one_none_zero_digit - ) + ).optimize() graph_thousands = pynini.union( - graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("mil"), - pynutil.insert("001") + pynutil.delete("mil"), # because we say 'mil', not 'un mil' + graph_hundred_component_at_least_one_none_zero_digit + delete_space + self.delete_word("mil"), + pynutil.insert("001") + self.delete_word("mil"), # because we say 'mil', not 'un mil' pynutil.insert("000", weight=0.1), ) graph_millones = pynini.union( graph_hundred_component_at_least_one_none_zero_digit + delete_space - + (pynutil.delete("millones") | pynutil.delete("millón")), - pynutil.insert("000") + pynutil.delete("millones"), # to allow for 'mil millones' + + (self.delete_word("millones") | self.delete_word("millón")), + pynutil.insert("000") + self.delete_word("millones"), # to allow for 'mil millones' ) graph_mil_millones = pynini.union( - graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("mil"), - pynutil.insert("001") + pynutil.delete("mil"), # because we say 'mil', not 'un mil' + graph_hundred_component_at_least_one_none_zero_digit + delete_space + self.delete_word("mil"), + pynutil.insert("001") + self.delete_word("mil"), # because we say 'mil', not 'un mil' ) graph_mil_millones += delete_space + ( - graph_millones | pynutil.insert("000") + pynutil.delete("millones") + graph_millones | pynutil.insert("000") + self.delete_word("millones") ) # allow for 'mil millones' graph_mil_millones |= pynutil.insert("000000", weight=0.1) @@ -88,36 +102,36 @@ def __init__(self): graph_millardo = ( graph_hundred_component_at_least_one_none_zero_digit + delete_space - + (pynutil.delete("millardo") | pynutil.delete("millardos")) + + (self.delete_word("millardo") | self.delete_word("millardos")) ) graph_billones = pynini.union( graph_hundred_component_at_least_one_none_zero_digit + delete_space - + (pynutil.delete("billones") | pynutil.delete("billón")), + + (self.delete_word("billones") | self.delete_word("billón")), ) graph_mil_billones = pynini.union( - graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("mil"), - pynutil.insert("001") + pynutil.delete("mil"), # because we say 'mil', not 'un mil' + graph_hundred_component_at_least_one_none_zero_digit + delete_space + self.delete_word("mil"), + pynutil.insert("001") + self.delete_word("mil"), # because we say 'mil', not 'un mil' ) graph_mil_billones += delete_space + ( - graph_billones | pynutil.insert("000") + pynutil.delete("billones") + graph_billones | pynutil.insert("000") + 
self.delete_word("billones") ) # allow for 'mil billones' graph_mil_billones |= pynutil.insert("000000", weight=0.1) graph_trillones = pynini.union( graph_hundred_component_at_least_one_none_zero_digit + delete_space - + (pynutil.delete("trillones") | pynutil.delete("trillón")), + + (self.delete_word("trillones") | self.delete_word("trillón")), ) graph_mil_trillones = pynini.union( - graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("mil"), - pynutil.insert("001") + pynutil.delete("mil"), # because we say 'mil', not 'un mil' + graph_hundred_component_at_least_one_none_zero_digit + delete_space + self.delete_word("mil"), + pynutil.insert("001") + self.delete_word("mil"), # because we say 'mil', not 'un mil' ) graph_mil_trillones += delete_space + ( - graph_trillones | pynutil.insert("000") + pynutil.delete("trillones") + graph_trillones | pynutil.insert("000") + self.delete_word("trillones") ) # allow for 'mil trillones' graph_mil_trillones |= pynutil.insert("000000", weight=0.1) @@ -143,12 +157,12 @@ def __init__(self): pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0" ) - self.graph_no_exception = graph + self.graph_no_exception = graph.optimize() # save self.numbers_up_to_thousand for use in DecimalFst digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3) - numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize() - self.numbers_up_to_thousand = numbers_up_to_thousand + numbers_up_to_thousand = pynini.compose(self.graph_no_exception, digits_up_to_thousand).optimize() + self.numbers_up_to_thousand = numbers_up_to_thousand.optimize() # save self.numbers_up_to_million for use in DecimalFst digits_up_to_million = ( @@ -160,18 +174,37 @@ def __init__(self): | (NEMO_DIGIT ** 6) ) numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize() - self.numbers_up_to_million = numbers_up_to_million + self.numbers_up_to_million = numbers_up_to_million.optimize() + + + if input_case == INPUT_CASED: + graph |= capitalized_input_graph(graph) + graph_digit |= capitalized_input_graph(graph_digit) + graph_zero |= capitalized_input_graph(graph_zero) + # graph_exception = capitalized_input_graph(graph_exception) + self.graph_no_exception |= capitalized_input_graph(self.graph_no_exception).optimize() + self.numbers_up_to_thousand |= capitalized_input_graph(self.numbers_up_to_thousand).optimize() # don't convert cardinals from zero to nine inclusive graph_exception = pynini.project(pynini.closure(NEMO_SPACE, 0, 1) + (graph_digit | graph_zero), 'input') - self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph + self.graph = ((pynini.project(graph, "input") - graph_exception.arcsort()) @ graph).optimize() optional_minus_graph = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("menos", "\"-\"") + NEMO_SPACE, 0, 1 + pynutil.insert("negative: ") + pynini.cross(ES_MINUS, "\"-\"") + NEMO_SPACE, 0, 1 ) final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() + + + def delete_word(self, word: str): + """ Capitalizes word for `cased` input""" + delete_graph = pynutil.delete(word).optimize() + if self.input_case == INPUT_CASED: + if len(word) > 0: + delete_graph |= pynutil.delete(word[0].upper() + word[1:]) + + return delete_graph.optimize() diff --git 
a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py index 3100e6a50..8b749bd8f 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py @@ -15,7 +15,14 @@ import pynini from nemo_text_processing.inverse_text_normalization.es.graph_utils import int_to_roman from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + GraphFst, + capitalized_input_graph, + delete_extra_space, + delete_space, +) from pynini.lib import pynutil @@ -24,9 +31,13 @@ class DateFst(GraphFst): Finite state transducer for classifying date, e.g. primero de enero -> date { day: "1" month: "enero" } e.g. uno de enero -> date { day: "1" month: "enero" } + + Args: + cardinal: CardinalFst + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self, cardinal: GraphFst): + def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): super().__init__(name="date", kind="classify") graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) @@ -37,6 +48,10 @@ def __init__(self, cardinal: GraphFst): graph_month = pynini.string_file(get_abs_path("data/dates/months.tsv")) graph_suffix = pynini.string_file(get_abs_path("data/dates/year_suffix.tsv")).invert() + if input_case == INPUT_CASED: + graph_month |= pynini.string_file(get_abs_path("data/dates/months_cased.tsv")) + graph_suffix |= pynini.string_file(get_abs_path("data/dates/year_suffix_cased.tsv")).invert() + graph_1_to_100 = pynini.union( graph_digit, graph_twenties, @@ -67,5 +82,9 @@ def __init__(self, cardinal: GraphFst): final_graph = graph_dm | roman_centuries_graph | year_with_suffix_graph final_graph += pynutil.insert(" preserve_order: true") + + if input_case == INPUT_CASED: + final_graph |= capitalized_input_graph(final_graph) + final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py index bdbf18049..64fbd3acb 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py @@ -15,15 +15,24 @@ import pynini from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + MIN_NEG_WEIGHT, NEMO_DIGIT, + NEMO_SIGMA, + TO_LOWER, GraphFst, + capitalized_input_graph, delete_extra_space, delete_space, ) +from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS from pynini.lib import pynutil -def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_million: 'pynini.FstLike') -> 'pynini.FstLike': +def get_quantity( + decimal: 'pynini.FstLike', cardinal_up_to_million: 'pynini.FstLike', input_case: str = INPUT_LOWER_CASED + ) -> 'pynini.FstLike': """ Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, e.g. 
one million -> integer_part: "1" quantity: "million" @@ -32,12 +41,13 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_million: 'pynini.FstL Args: decimal: decimal FST cardinal_up_to_million: cardinal FST + input_case: accepting either "lower_cased" or "cased" input. """ numbers = cardinal_up_to_million @ ( pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) ) - suffix = pynini.union( + suffix_labels = [ "millón", "millones", "millardo", @@ -48,7 +58,12 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_million: 'pynini.FstL "trillones", "cuatrillón", "cuatrillones", - ) + ] + suffix = pynini.union(*suffix_labels) + + if input_case == INPUT_CASED: + suffix |= pynini.union(*[x[0].upper() + x[1:] for x in suffix_labels]).optimize() + res = ( pynutil.insert("integer_part: \"") + numbers @@ -78,23 +93,29 @@ class DecimalFst(GraphFst): e.g. mil ochocientos veinticuatro millones -> decimal { negative: "false" integer_part: "1824" quantity: "millones" } Args: cardinal: CardinalFst + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self, cardinal: GraphFst): + def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): super().__init__(name="decimal", kind="classify") # number after decimal point can be any series of cardinals <1000, including 'zero' graph_decimal = cardinal.numbers_up_to_thousand graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal - self.graph = graph_decimal + self.graph = graph_decimal.optimize() # decimal point can be denoted by 'coma' or 'punto' decimal_point = pynini.cross("coma", "morphosyntactic_features: \",\"") decimal_point |= pynini.cross("punto", "morphosyntactic_features: \".\"") + if input_case == INPUT_CASED: + decimal_point |= pynini.cross("Coma", "morphosyntactic_features: \",\"") + decimal_point |= pynini.cross("Punto", "morphosyntactic_features: \".\"") + + optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("menos", "\"true\"") + delete_extra_space, 0, 1 + pynutil.insert("negative: ") + pynini.cross(ES_MINUS, "\"true\"") + delete_extra_space, 0, 1 ) graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"") @@ -110,8 +131,21 @@ def __init__(self, cardinal: GraphFst): final_graph = optional_graph_negative + final_graph_wo_sign self.final_graph_wo_negative = final_graph_wo_sign | get_quantity( - final_graph_wo_sign, cardinal.numbers_up_to_million + final_graph_wo_sign, cardinal.numbers_up_to_million, input_case=input_case + ).optimize() + + # accept semiotic spans that start with a capital letter + self.final_graph_wo_negative |= pynutil.add_weight( + pynini.compose(TO_LOWER + NEMO_SIGMA, self.final_graph_wo_negative), MIN_NEG_WEIGHT + ).optimize() + + quantity_graph = get_quantity( + final_graph_wo_sign, cardinal.numbers_up_to_million, input_case=input_case ) - final_graph |= optional_graph_negative + get_quantity(final_graph_wo_sign, cardinal.numbers_up_to_million) + final_graph |= optional_graph_negative + quantity_graph + + if input_case == INPUT_CASED: + final_graph |= capitalized_input_graph(final_graph) + final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py index 53b6b4d09..0b4e63da4 100644 --- 
a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py @@ -13,8 +13,18 @@ # limitations under the License. import pynini +from nemo_text_processing.inverse_text_normalization.en.utils import get_various_formats from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, GraphFst, insert_space +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + MIN_POS_WEIGHT, + NEMO_ALPHA, + GraphFst, + capitalized_input_graph, + insert_space, +) +from nemo_text_processing.text_normalization.en.utils import load_labels from pynini.lib import pynutil @@ -25,22 +35,36 @@ class ElectronicFst(GraphFst): and URLS (which get converted to a "protocol" field). e.g. c d f uno arroba a b c punto e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } } e.g. doble ve doble ve doble ve a b c punto e d u -> tokens { electronic { protocol: "www.abc.edu" } } + + Args: + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self): + def __init__(self, input_case: str = INPUT_LOWER_CASED): super().__init__(name="electronic", kind="classify") delete_extra_space = pynutil.delete(" ") - alpha_num = ( - NEMO_ALPHA - | pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - | pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + + num = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) | pynini.string_file( + get_abs_path("data/numbers/zero.tsv") ) + if input_case == INPUT_CASED: + num = capitalized_input_graph(num) + + alpha_num = (NEMO_ALPHA | num).optimize() symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).invert() + if input_case == INPUT_CASED: + symbols = capitalized_input_graph(symbols) accepted_username = alpha_num | symbols - process_dot = pynini.cross("punto", ".") + dot = pynini.accep("punto") + if input_case == INPUT_CASED: + dot |= pynini.accep("Punto") + process_dot = pynini.cross(dot, ".") + alternative_dot = ( + pynini.closure(delete_extra_space, 0, 1) + pynini.accep(".") + pynini.closure(delete_extra_space, 0, 1) + ) username = ( pynutil.insert("username: \"") + alpha_num @@ -50,47 +74,109 @@ def __init__(self): + pynutil.insert("\"") ) single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num - server = single_alphanum | pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).invert() - domain = single_alphanum | pynini.string_file(get_abs_path("data/electronic/domain.tsv")).invert() + + server_names = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).invert() + if input_case == INPUT_CASED: + server_names = capitalized_input_graph(server_names) + server = ( + single_alphanum + | server_names + | pynini.closure(NEMO_ALPHA, 2) + ) + + if input_case == INPUT_CASED: + domain = [] + # get domain formats + for d in load_labels(get_abs_path("data/electronic/domain.tsv")): + domain.extend(get_various_formats(d[0])) + domain = pynini.string_map(domain).optimize() + else: + domain = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).invert() + + domain = pynutil.add_weight(single_alphanum, weight=-0.0001) | domain | pynini.closure(NEMO_ALPHA, 2) + domain_graph = ( pynutil.insert("domain: \"") + server - + delete_extra_space - + process_dot - + delete_extra_space + + ((delete_extra_space + process_dot + delete_extra_space) | 
alternative_dot) + domain + pynutil.insert("\"") ) + + at = pynini.accep("arroba") + if input_case == INPUT_CASED: + at |= pynini.accep("Arroba") + graph = ( - username + delete_extra_space + pynutil.delete("arroba") + insert_space + delete_extra_space + domain_graph + username + delete_extra_space + pynutil.delete(at) + insert_space + delete_extra_space + domain_graph ) ############# url ### - protocol_end = pynini.cross(pynini.union("www", "w w w", "doble ve doble ve doble ve"), "www") - protocol_start = pynini.cross(pynini.union("http", "h t t p", "hache te te pe"), "http") - protocol_start |= pynini.cross(pynini.union("https", "h t t p s", "hache te te pe ese"), "https") + if input_case == INPUT_CASED: + spoken_ws = pynini.union( + "doble ve doble ve doble ve", + "Doble Ve Doble Ve Doble Ve", + "Doble ve doble ve doble ve" + ) + protocol_end = pynini.cross(pynini.union(*get_various_formats("www")) | spoken_ws, "www") + + spoken_http = pynini.union( + "hache te te pe", + "Hache te te pe", + "Hache Te Te Pe" + ) + spoken_https = pynini.union( + "hache te te pe ese", + "Hache te te pe ese", + "Hache Te Te Pe Ese" + ) + protocol_start = pynini.cross(pynini.union(*get_various_formats("http")) | spoken_http, "http") | pynini.cross( + pynini.union(*get_various_formats("https")) | spoken_https, "https" + ) + else: + protocol_end = pynutil.add_weight(pynini.cross(pynini.union("www", "w w w", "doble ve doble ve doble ve"), "www"), MIN_POS_WEIGHT) + protocol_start = pynutil.add_weight(pynini.cross(pynini.union("http", "h t t p", "hache te te pe"), "http"), MIN_POS_WEIGHT) + protocol_start |= pynutil.add_weight(pynini.cross(pynini.union("https", "h t t p s", "hache te te pe ese"), "https"), MIN_POS_WEIGHT) + protocol_start += pynini.cross(" dos puntos barra barra ", "://") # e.g. 
.com, .es ending = ( delete_extra_space - + symbols + + symbols + delete_extra_space + (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username) ) + protocol_default = ( + ( + (pynini.closure(delete_extra_space + accepted_username, 1) | server) + | pynutil.add_weight(pynini.closure(NEMO_ALPHA, 1), weight=0.001) + ) + + pynini.closure(ending, 1) + ).optimize() + protocol = ( pynini.closure(protocol_start, 0, 1) + protocol_end + delete_extra_space + process_dot + delete_extra_space - + (pynini.closure(delete_extra_space + accepted_username, 1) | server) - + pynini.closure(ending, 1) - ) + + protocol_default + ).optimize() + + if input_case == INPUT_CASED: + protocol |= ( + pynini.closure(protocol_start, 0, 1) + protocol_end + alternative_dot + protocol_default + ).optimize() + + protocol |= pynini.closure(protocol_end + delete_extra_space + process_dot, 0, 1) + protocol_default + protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"") graph |= protocol - ######## + + if input_case == INPUT_CASED: + graph = capitalized_input_graph(graph, capitalized_graph_weight=MIN_POS_WEIGHT) final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py index f31d984eb..de1ea519f 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py @@ -15,7 +15,8 @@ import pynini from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, NEMO_SPACE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import INPUT_LOWER_CASED, NEMO_SIGMA, NEMO_SPACE, GraphFst +from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS from pynini.lib import pynutil @@ -37,9 +38,10 @@ class FractionFst(GraphFst): Args: cardinal: CardinalFst ordinal: OrdinalFst + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self, cardinal: GraphFst, ordinal: GraphFst): + def __init__(self, cardinal: GraphFst, ordinal: GraphFst, input_case: str = INPUT_LOWER_CASED): super().__init__(name="fraction", kind="classify") cardinal_graph = cardinal.graph_no_exception @@ -74,7 +76,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): # process negative fractions # e.g. "menos dos tercios" -> "fractions { negative: True numerator: "2" denominator: "3" }" - optional_negative_graph = pynini.closure(pynini.cross("menos", "negative: \"True\"") + NEMO_SPACE, 0, 1) + optional_negative_graph = pynini.closure(pynini.cross(ES_MINUS, "negative: \"True\"") + NEMO_SPACE, 0, 1) # process mixed fractions # e.g. 
"dos y dos tercios" -> "fractions { integer_part: "2" numerator: "2" denominator: "3" }" @@ -96,7 +98,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): ) proper_fractions_with_medio = optional_negative_graph + proper_fractions_with_medio - self.proper_fractions_with_medio = self.add_tokens(proper_fractions_with_medio) + self.proper_fractions_with_medio = self.add_tokens(proper_fractions_with_medio).optimize() graph = ( optional_negative_graph + optional_integer_part_graph + numerators_graph + NEMO_SPACE + denominators_graph diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py index 6aea36ede..16f0b0073 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py @@ -15,13 +15,17 @@ import pynini from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, NEMO_ALPHA, NEMO_SIGMA, + TO_LOWER, GraphFst, convert_space, delete_extra_space, delete_space, ) +from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS from pynini.lib import pynutil @@ -33,9 +37,11 @@ class MeasureFst(GraphFst): Args: cardinal: CardinalFst decimal: DecimalFst + fraction: FractionFst + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst): + def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, input_case: str = INPUT_LOWER_CASED): super().__init__(name="measure", kind="classify") cardinal_graph = cardinal.graph_no_exception @@ -45,13 +51,19 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst): math_symbols = pynini.string_file(get_abs_path("data/measures/math_symbols.tsv")) equal_symbol = pynini.string_map([("es igual a", "="), ("igual a", "=")]) + # accept capital letters in units + casing_graph = pynini.closure(TO_LOWER | NEMO_SIGMA).optimize() + graph_unit_singular = pynini.string_file(get_abs_path("data/measures/measurements_singular.tsv")) graph_unit_singular = pynini.invert(graph_unit_singular) # singular -> abbr + graph_unit_singular = pynini.compose(casing_graph, graph_unit_singular).optimize() + graph_unit_plural = pynini.string_file(get_abs_path("data/measures/measurements_plural.tsv")) graph_unit_plural = pynini.invert(graph_unit_plural) # plural -> abbr + graph_unit_plural = pynini.compose(casing_graph, graph_unit_plural).optimize() optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("menos", "\"true\"") + delete_extra_space, 0, 1 + pynutil.insert("negative: ") + pynini.cross(ES_MINUS, "\"true\"") + delete_extra_space, 0, 1 ) unit_singular = convert_space(graph_unit_singular) diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/money.py b/nemo_text_processing/inverse_text_normalization/es/taggers/money.py index 357cc8e08..b78657fc6 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/money.py @@ -15,9 +15,12 @@ import pynini from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, NEMO_DIGIT, NEMO_SIGMA, GraphFst, + capitalized_input_graph, 
convert_space, delete_extra_space, delete_space, @@ -34,9 +37,10 @@ class MoneyFst(GraphFst): Args: cardinal: CardinalFst decimal: DecimalFst + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst): + def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPUT_LOWER_CASED): super().__init__(name="money", kind="classify") # quantity, integer_part, fractional_part, currency @@ -53,6 +57,18 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): unit_minor_plural = pynini.string_file(get_abs_path("data/money/currency_minor_plural.tsv")) unit_minor_plural = pynini.invert(unit_minor_plural) + if input_case == INPUT_CASED: + unit_singular = capitalized_input_graph(unit_singular) + unit_singular_capitalized = pynini.string_file(get_abs_path("data/money/currency_major_singular_capitalized.tsv")) + unit_singular |= pynini.invert(unit_singular_capitalized).optimize() + + unit_plural = capitalized_input_graph(unit_plural) + unit_plural_capitalized = pynini.string_file(get_abs_path("data/money/currency_major_plural_capitalized.tsv")) + unit_plural |= pynini.invert(unit_plural_capitalized).optimize() + + unit_minor_singular = capitalized_input_graph(unit_minor_singular).optimize() + unit_minor_plural = capitalized_input_graph(unit_minor_plural).optimize() + graph_unit_singular = pynutil.insert("currency: \"") + convert_space(unit_singular) + pynutil.insert("\"") graph_unit_plural = pynutil.insert("currency: \"") + convert_space(unit_plural) + pynutil.insert("\"") @@ -65,22 +81,30 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT) + one_graph = pynini.union("un", "una").optimize() + if input_case == INPUT_CASED: + one_graph |= pynini.union("Un", "Una").optimize() + # twelve dollars (and) fifty cents, zero cents cents_standalone = ( pynutil.insert("morphosyntactic_features: \",\"") # always use a comma in the decimal + insert_space + pynutil.insert("fractional_part: \"") + pynini.union( - pynutil.add_weight(((NEMO_SIGMA - "un") @ cardinal_graph), -0.7) @ add_leading_zero_to_double_digit + pynutil.add_weight(((NEMO_SIGMA - one_graph) @ cardinal_graph), -0.7) @ add_leading_zero_to_double_digit + delete_space, - pynini.cross("un", "01") + delete_space, + pynini.cross(one_graph, "01") + delete_space, ) + pynutil.insert("\"") ) + and_graph = pynini.union("con", "y").optimize() + if input_case == INPUT_CASED: + and_graph |= pynini.union("Con", "Y").optimize() + optional_cents_standalone = pynini.closure( delete_space - + pynini.closure((pynutil.delete("con") | pynutil.delete('y')) + delete_space, 0, 1) + + pynini.closure(pynutil.delete(and_graph) + delete_space, 0, 1) + insert_space + cents_standalone + pynutil.delete(pynini.union(unit_minor_singular, unit_minor_plural)), @@ -95,7 +119,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + pynutil.insert("morphosyntactic_features: \",\"") # always use a comma in the decimal + insert_space + pynutil.insert("fractional_part: \"") - + pynini.closure(pynutil.delete("con") + delete_space, 0, 1) + + pynini.closure(pynutil.delete(pynini.union("con", "Con")) + delete_space, 0, 1) + pynutil.add_weight(cardinal_graph @ add_leading_zero_to_double_digit, -0.7) + pynutil.insert("\""), 0, @@ -104,7 +128,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph_integer = ( pynutil.insert("integer_part: \"") - + ((NEMO_SIGMA - "un" - "una") @ 
cardinal_graph) + + ((NEMO_SIGMA - one_graph) @ cardinal_graph) + pynutil.insert("\"") + delete_extra_space + graph_unit_plural @@ -112,7 +136,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) graph_integer |= ( pynutil.insert("integer_part: \"") - + (pynini.cross("un", "1") | pynini.cross("una", "1")) + + pynini.cross(one_graph, "1") + pynutil.insert("\"") + delete_extra_space + graph_unit_singular diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py index 207bbbaf6..dc660c5ec 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py @@ -14,7 +14,14 @@ import pynini from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst, delete_space +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + NEMO_SIGMA, + GraphFst, + capitalized_input_graph, + delete_space, +) from pynini.lib import pynutil @@ -33,9 +40,10 @@ class OrdinalFst(GraphFst): Args: cardinal: CardinalFst + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self, cardinal: GraphFst): + def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): super().__init__(name="ordinal", kind="classify") cardinal_graph = cardinal.graph_no_exception @@ -45,6 +53,13 @@ def __init__(self, cardinal: GraphFst): graph_ties = pynini.string_file(get_abs_path("data/ordinals/ties.tsv")) graph_hundreds = pynini.string_file(get_abs_path("data/ordinals/hundreds.tsv")) + if input_case == INPUT_CASED: + graph_digit |= pynini.string_file(get_abs_path("data/ordinals/digit_capitalized.tsv")).optimize() + graph_teens |= pynini.string_file(get_abs_path("data/ordinals/teen_capitalized.tsv")).optimize() + graph_twenties |= pynini.string_file(get_abs_path("data/ordinals/twenties_capitalized.tsv")).optimize() + graph_ties |= pynini.string_file(get_abs_path("data/ordinals/ties_capitalized.tsv")).optimize() + graph_hundreds |= pynini.string_file(get_abs_path("data/ordinals/hundreds_capitalized.tsv")).optimize() + full_graph_ties = graph_ties | (graph_ties + pynini.cross(" ", "y") + graph_digit) ordinal_graph_union = pynini.union(graph_digit, graph_teens, graph_twenties, full_graph_ties, graph_hundreds,) @@ -64,7 +79,7 @@ def __init__(self, cardinal: GraphFst): graph_a_suffix = (optional_numbers_in_front + ordinal_graph_a) @ cardinal_graph graph_er_suffix = (optional_numbers_in_front + ordinal_graph_er) @ cardinal_graph - self.graph_masc_num_no_exception = graph_o_suffix + self.graph_masc_num_no_exception = graph_o_suffix.optimize() # don't convert ordinals from one to nine inclusive graph_exception = pynini.project(pynini.union(graph_digit), 'input') @@ -72,6 +87,12 @@ def __init__(self, cardinal: GraphFst): graph_a_suffix = (pynini.project(graph_a_suffix, "input") - graph_exception.arcsort()) @ graph_a_suffix graph_er_suffix = (pynini.project(graph_er_suffix, "input") - graph_exception.arcsort()) @ graph_er_suffix + if input_case == INPUT_CASED: + graph_exception = capitalized_input_graph(graph_exception) + graph_o_suffix = capitalized_input_graph(graph_o_suffix) + graph_a_suffix = capitalized_input_graph(graph_a_suffix) + graph_er_suffix = capitalized_input_graph(graph_er_suffix) + graph = ( pynutil.insert("integer: \"") + graph_o_suffix @@ 
-90,6 +111,6 @@ def __init__(self, cardinal: GraphFst): + pynutil.insert("\"") + pynutil.insert(" morphosyntactic_features: \"er\"") ) - + final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py index 0f6b5f003..3bf632c74 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py @@ -14,7 +14,15 @@ import pynini from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_space +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + MIN_NEG_WEIGHT, + GraphFst, + capitalized_input_graph, + delete_space, +) +from nemo_text_processing.text_normalization.es.graph_utils import ES_PLUS from pynini.lib import pynutil @@ -30,9 +38,12 @@ class TelephoneFst(GraphFst): "twelve thirty four" = "1234". (we ignore more complicated cases such as "three hundred and two" or "three nines"). + + Args: + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self): + def __init__(self, input_case: str = INPUT_LOWER_CASED): super().__init__(name="telephone", kind="classify") # create `single_digits` and `double_digits` graphs as these will be @@ -41,8 +52,16 @@ def __init__(self): graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")) graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")) graph_twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv")) + graph_zero = pynini.cross("cero", "0") - single_digits = graph_digit.optimize() | pynini.cross("cero", "0") + if input_case == INPUT_CASED: + graph_digit = capitalized_input_graph(graph_digit) + graph_ties = capitalized_input_graph(graph_ties) + graph_teen = capitalized_input_graph(graph_teen) + graph_twenties = capitalized_input_graph(graph_twenties) + graph_zero = pynini.cross(pynini.union("cero", "Cero"), "0").optimize() + + single_digits = graph_digit.optimize() | graph_zero double_digits = pynini.union( graph_twenties, @@ -57,7 +76,7 @@ def __init__(self): digit_thrice = digit_twice + pynutil.delete(" ") + single_digits # accept `doble cero` -> `00` and `triple ocho` -> `888` - digit_words = pynini.union(graph_digit.optimize(), pynini.cross("cero", "0")).invert() + digit_words = pynini.union(graph_digit.optimize(), graph_zero).invert() doubled_digit = pynini.union( *[ @@ -111,14 +130,21 @@ def __init__(self): # 8-digit option eight_digit_graph = group_of_four + insert_separator + group_of_four + plus = pynini.accep("más") + if input_case == INPUT_CASED: + plus |= ES_PLUS + # optionally denormalize country codes optional_country_code = pynini.closure( - pynini.cross("más ", "+") + (single_digits | group_of_two | group_of_three) + insert_separator, 0, 1 + pynini.cross(plus, "+") + delete_space + (single_digits | group_of_two | group_of_three) + insert_separator, 0, 1 ) + ext_phrase = pynini.accep(" extensión ") + if input_case == INPUT_CASED: + ext_phrase = pynini.union(" extensión ", " Extensión ") # optionally denormalize extensions optional_extension = pynini.closure( - pynini.cross(" extensión ", " ext. ") + (single_digits | group_of_two | group_of_three), 0, 1 + pynini.cross(ext_phrase, " ext. 
") + (single_digits | group_of_two | group_of_three), 0, 1 ) number_part = ( @@ -130,5 +156,8 @@ def __init__(self): number_part = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"") graph = number_part + if input_case == INPUT_CASED: + graph |= capitalized_input_graph(graph) + final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py index 6e57b5a77..55523ee47 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py @@ -16,12 +16,16 @@ import pynini from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, GraphFst, + capitalized_input_graph, convert_space, delete_extra_space, delete_space, insert_space, ) +from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS, ES_PLUS from pynini.lib import pynutil @@ -59,9 +63,12 @@ class TimeFst(GraphFst): so far the rules have not been added to the TimeFst tagger to process timezones (to keep the rules simple, and because timezones are not very often specified in Spanish.) + + Args: + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self): + def __init__(self, input_case: str = INPUT_LOWER_CASED): super().__init__(name="time", kind="classify") suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv")) @@ -81,7 +88,7 @@ def __init__(self): graph_teen, (graph_ties + pynutil.insert("0")), (graph_ties + pynutil.delete(" y ") + graph_digit), - ) + ).optimize() # note that graph_hour will start from 2 hours # "1 o'clock" will be treated differently because it @@ -89,24 +96,45 @@ def __init__(self): digits_2_to_23 = [str(digits) for digits in range(2, 24)] digits_1_to_59 = [str(digits) for digits in range(1, 60)] - graph_1oclock = pynini.cross("la una", "la 1") - graph_hour = pynini.cross("las ", "las ") + graph_1_to_100 @ pynini.union(*digits_2_to_23) + oneoclock = pynini.accep("la una") + article = pynini.accep("las ") + half = pynini.accep("media") + quarter = pynini.accep("cuarto") + and_graph = pynini.union("y", "con") + + if input_case == INPUT_CASED: + suffix_graph |= pynini.string_file(get_abs_path("data/time/time_suffix_cased.tsv")).optimize() + time_zones |= pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone_cased.tsv"))).optimize() + graph_digit |= capitalized_input_graph(graph_digit).optimize() + graph_1_to_100 |= capitalized_input_graph(graph_1_to_100).optimize() + article |= pynini.accep("Las ").optimize() + half |= pynini.accep("Media").optimize() + quarter |= pynini.accep("Cuarto").optimize() + and_graph |= pynini.union("Y","Con").optimize() + + graph_1oclock = pynini.cross(oneoclock, "la 1") + if input_case == INPUT_CASED: + graph_1oclock |= pynini.cross(pynini.accep("la Una"), "la 1") + oneoclock_capitalized = pynini.union("La Una", "La una") + graph_1oclock |= pynini.cross(oneoclock_capitalized, "La 1").optimize() + + graph_hour = article + graph_1_to_100 @ pynini.union(*digits_2_to_23) graph_minute = graph_1_to_100 @ pynini.union(*digits_1_to_59) - graph_minute_verbose = pynini.cross("media", "30") | pynini.cross("cuarto", "15") + graph_minute_verbose = pynini.cross(half, "30") | pynini.cross(quarter, "15") final_graph_hour = pynutil.insert("hours: \"") + 
(graph_1oclock | graph_hour) + pynutil.insert("\"") final_graph_minute = ( pynutil.insert("minutes: \"") - + pynini.closure((pynutil.delete("y") | pynutil.delete("con")) + delete_space, 0, 1) + + pynini.closure(pynutil.delete(and_graph) + delete_space, 0, 1) + (graph_minute | graph_minute_verbose) + pynutil.insert("\"") - ) + ).optimize() # g m t más tres -> las 2:00 p.m. gmt+3 digits_1_to_23 = [str(digits) for digits in range(1, 24)] offset = graph_1_to_100 @ pynini.union(*digits_1_to_23) - sign = pynini.cross("más", "+") | pynini.cross("menos", "-") + sign = pynini.cross(ES_PLUS, "+") | pynini.cross(ES_MINUS, "-") full_offset = pynutil.delete(" ") + sign + pynutil.delete(" ") + offset graph_offset = pynini.closure(full_offset, 0, 1) graph_time_zones = pynini.accep(" ") + time_zones + graph_offset @@ -126,8 +154,8 @@ def __init__(self): ) # las nueve a eme (only convert on-the-hour times if they are followed by a suffix) - graph_1oclock_with_suffix = pynini.closure(pynini.accep("la "), 0, 1) + pynini.cross("una", "1") - graph_hour_with_suffix = pynini.closure(pynini.accep("las "), 0, 1) + graph_1_to_100 @ pynini.union( + graph_1oclock_with_suffix = pynini.closure(pynini.union("la ", "La "), 0, 1) + pynini.cross(pynini.union("una", "Una"), "1") + graph_hour_with_suffix = pynini.closure(article, 0, 1) + graph_1_to_100 @ pynini.union( *digits_2_to_23 ) final_graph_hour_with_suffix = ( @@ -170,14 +198,14 @@ def __init__(self): + delete_extra_space + pynutil.insert("minutes: \"") + delete_space - + pynutil.delete("menos") + + pynutil.delete(ES_MINUS) + delete_space + pynini.union( - pynini.cross("cinco", "55"), - pynini.cross("diez", "50"), - pynini.cross("cuarto", "45"), - pynini.cross("veinte", "40"), - pynini.cross("veinticinco", "30"), + pynini.cross(pynini.union("cinco", "Cinco"), "55"), + pynini.cross(pynini.union("diez", "Diez"), "50"), + pynini.cross(pynini.union("cuarto", "Cuarto"), "45"), + pynini.cross(pynini.union("veinte", "Veinte"), "40"), + pynini.cross(pynini.union("veinticinco", "Veinticinco"), "35"), ) + pynutil.insert("\"") ) diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py index 315c14c9d..118cee63d 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py @@ -71,27 +71,27 @@ def __init__( else: logger.info(f"Creating ClassifyFst grammars.") - cardinal = CardinalFst() + cardinal = CardinalFst(input_case=input_case) cardinal_graph = cardinal.fst - ordinal = OrdinalFst(cardinal) + ordinal = OrdinalFst(cardinal, input_case=input_case) ordinal_graph = ordinal.fst - decimal = DecimalFst(cardinal) + decimal = DecimalFst(cardinal, input_case=input_case) decimal_graph = decimal.fst - fraction = FractionFst(cardinal, ordinal) + fraction = FractionFst(cardinal, ordinal, input_case=input_case) fraction_graph = fraction.fst - measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction).fst - date_graph = DateFst(cardinal).fst + measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, input_case=input_case).fst + date_graph = DateFst(cardinal, input_case=input_case).fst word_graph = WordFst().fst - time_graph = TimeFst().fst - money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst - whitelist_graph = WhiteListFst(input_file=whitelist).fst + time_graph =
TimeFst(input_case=input_case).fst + money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, input_case=input_case).fst + whitelist_graph = WhiteListFst(input_file=whitelist, input_case=input_case).fst punct_graph = PunctuationFst().fst - electronic_graph = ElectronicFst().fst - telephone_graph = TelephoneFst().fst + electronic_graph = ElectronicFst(input_case=input_case).fst + telephone_graph = TelephoneFst(input_case=input_case).fst classify = ( pynutil.add_weight(whitelist_graph, 1.01) @@ -104,7 +104,7 @@ def __init__( | pynutil.add_weight(ordinal_graph, 1.6) | pynutil.add_weight(money_graph, 1.6) | pynutil.add_weight(telephone_graph, 1.6) - | pynutil.add_weight(electronic_graph, 1.6) + | pynutil.add_weight(electronic_graph, 2.96) | pynutil.add_weight(word_graph, 100) ) diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py index 2d31cede5..d2d6421fd 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py @@ -13,8 +13,15 @@ # limitations under the License. import pynini +import os from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + GraphFst, + convert_space, +) +from nemo_text_processing.text_normalization.en.utils import load_labels from pynini.lib import pynutil @@ -27,16 +34,50 @@ class WhiteListFst(GraphFst): Whitelisted tokens are defined and loaded from "data/whitelist.tsv" (unless input_file specified). Args: + input_case: accepting either "lower_cased" or "cased" input. input_file: path to a file with whitelist replacements (each line of the file: written_form\tspoken_form\n), e.g. nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv """ - def __init__(self, input_file: str = None): + def __init__(self, input_case: str = INPUT_LOWER_CASED, input_file: str = None): super().__init__(name="whitelist", kind="classify") - if input_file: - whitelist = pynini.string_file(input_file).invert() - else: - whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert() + def get_whitelist_graph(input_file: str): + labels = load_labels(input_file) + + if input_case == INPUT_CASED: + additional_labels = [] + for written, spoken in labels: + written_capitalized = written[0].upper() + written[1:] + additional_labels.extend( + [ + [written_capitalized, spoken.capitalize()], # first letter capitalized + [ + written_capitalized, + spoken.upper().replace(" Y ", " y "), + ], # # add pairs with the all letters capitalized + ] + ) + + spoken_no_space = spoken.replace(" ", "") + # add abbreviations without spaces (both lower and upper case), i.e. 
"BMW" not "B M W" + if len(spoken) == (2 * len(spoken_no_space) - 1): + additional_labels.extend( + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]] + ) + + labels += additional_labels + + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + if input_file is None: + input_file = get_abs_path("data/whitelist.tsv") + + if not os.path.exists(input_file): + raise ValueError(f"Whitelist file {input_file} not found") + + whitelist = get_whitelist_graph(input_file) + graph = pynutil.insert("name: \"") + convert_space(whitelist) + pynutil.insert("\"") self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py index 234fdc296..397737380 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py @@ -13,6 +13,7 @@ # limitations under the License. import pynini +from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, @@ -31,6 +32,9 @@ class DateFst(GraphFst): def __init__(self): super().__init__(name="date", kind="verbalize") + graph_month = pynini.string_file(get_abs_path("data/dates/months.tsv")) + graph_month |= pynini.string_file(get_abs_path("data/dates/months_cased.tsv")) + year = ( pynutil.delete("year:") + delete_space @@ -42,7 +46,7 @@ def __init__(self): pynutil.delete("month:") + delete_space + pynutil.delete("\"") - + pynini.closure(NEMO_NOT_QUOTE, 1) + + graph_month + pynutil.delete("\"") ) day = ( diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py index 15b7ddbf1..ab7913b64 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py @@ -35,11 +35,13 @@ def __init__(self): add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT) # hour may or may not include preposition ("la" or "las") + preposition = pynini.union("la ", "las ", "La ", "Las ") + hour = ( pynutil.delete("hours:") + delete_space + pynutil.delete("\"") - + pynini.closure(pynini.union("la ", "las "), 0, 1) + + pynini.closure(preposition, 0, 1) + pynini.closure(NEMO_DIGIT, 1) + pynutil.delete("\"") ) diff --git a/nemo_text_processing/text_normalization/es/graph_utils.py b/nemo_text_processing/text_normalization/es/graph_utils.py index f9f3dc22b..35d5b598d 100644 --- a/nemo_text_processing/text_normalization/es/graph_utils.py +++ b/nemo_text_processing/text_normalization/es/graph_utils.py @@ -39,6 +39,9 @@ fem_hundreds = hundreds @ pynini.cdrewrite(pynini.cross("ientos", "ientas"), "", "", NEMO_SIGMA) +ES_MINUS = pynini.union("menos", "Menos", "MENOS").optimize() +ES_PLUS = pynini.union("más", "Más", "MÁS").optimize() + def strip_accent(fst: 'pynini.FstLike') -> 'pynini.FstLike': """ Converts all accented vowels to non-accented equivalents diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_cardinal_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_cardinal_cased.txt new file mode 100644 index 000000000..15514ae12 --- /dev/null +++ 
b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_cardinal_cased.txt @@ -0,0 +1,30 @@ +Doscientos cincuenta y uno~251 +Novecientos noventa y nueve millones novecientos noventa y nueve mil novecientos noventa y nueve~999999999 +Cero~Cero +Uno~Uno +una~una +dos~dos +Nueve~Nueve +Diez~10 +, uno~, uno +, diez~, 10 +Menos veintitrés~-23 +cien~100 +ciento uno~101 +ciento un~101 +ciento una~101 +mil y uno~1001 +Mil una~1001 +nueve billones setecientos ochenta y nueve mil trescientos ochenta y dos millones quinientos treinta y seis mil ciento treinta~9789382536130 +Doscientos cincuenta y cuatro~254 +ciento cuarenta y siete mil cuatrocientos cincuenta y uno~147451 +Un Millón ciento cincuenta y seis mil ciento setenta y tres~1156173 +Mil quinientos noventa y tres millones setenta y dos mil novecientos sesenta y uno~1593072961 +noventa y siete mil ochocientos ocho billones doscientos sesenta y cuatro mil setecientos setenta y dos millones setecientos noventa y dos mil cinco~97808264772792005 +diecisiete mil ochocientos cincuenta y cinco trillones treinta y seis mil seiscientos cincuenta y siete billones siete mil quinientos noventa y seis millones ciento diez mil novecientos cuarenta y nueve~17855036657007596110949 +diez mil diez billones diez millones cien mil diez~10010000010100010 +Menos veinticinco mil treinta y siete~-25037 +mil doscientos sesenta y cuatro billones trescientos un mil novecientos treinta y ocho millones ciento cuatro~1264301938000104 +menos sesenta~-60 +cuarenta y seis mil seiscientos sesenta y cuatro~46664 +sesenta~60 \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_date_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_date_cased.txt new file mode 100644 index 000000000..98bfd6fe3 --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_date_cased.txt @@ -0,0 +1,8 @@ +Primero De Enero~1 de Enero +Uno de enero~1 de Enero +el uno de Diciembre~el 1 de Diciembre +El primero de diciembre~El 1 de diciembre +Domingo Veintiséis De Octubre~Domingo 26 de Octubre +treinta y uno de diciembre de mil novecientos noventa y dos~31 de diciembre de 1992 +Siglo diecinueve~Siglo xix +doscientos tres antes de Cristo~203 A. C. 
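The cased date cases above run through the same public entry point as the lower-cased ones. A minimal sketch of spot-checking one of them by hand, assuming a local build of the Spanish grammars (the cache_dir handling used in the test suite is omitted here):

    # Minimal sketch: verify one cased date case from the file above.
    from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer

    # input_case="cased" turns on the capitalized grammars added by this patch
    itn_cased = InverseNormalizer(lang="es", input_case="cased")

    pred = itn_cased.inverse_normalize("Domingo Veintiséis De Octubre", verbose=False)
    assert pred == "Domingo 26 de Octubre"  # expected output listed above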
\ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_decimal_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_decimal_cased.txt new file mode 100644 index 000000000..81a91bb3a --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_decimal_cased.txt @@ -0,0 +1,6 @@ +Uno coma dos seis~1,26 +Menos uno coma dos seis~-1,26 +Uno Coma Veintiséis~1,26 +Cero coma Dos seis~0,26 +cero coma veintiséis~0,26 +tres coma ciento cuarenta y uno~3,141 \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_electronic_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_electronic_cased.txt new file mode 100644 index 000000000..2d3f26b9d --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_electronic_cased.txt @@ -0,0 +1,5 @@ +A punto B C Arroba G mail punto com~A.BC@gmail.com +c d f Arroba a b c Punto e d u~cdf@abc.edu +W W W Punto N vidia Punto com~www.nvidia.com +Doble ve doble ve doble ve punto a b c punto es barra e f g~www.abc.es/efg +Doble Ve Doble Ve Doble Ve Punto a b c Punto e s~www.abc.es \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure.txt index 02895142e..6b80918b5 100644 --- a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure.txt @@ -13,7 +13,7 @@ cuatro segundos~4 s cinco litros~5 l tres metros cúbicos~3 m³ dos kilómetros por hora~2 kph -diez grados farenheit~10 ° f +diez grados farenheit~10 ° F dos metros y medio~2 1/2 m tres quintos de metro~3/5 m menos tres y medio metros por hora~-3 1/2 m/h diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure_cased.txt new file mode 100644 index 000000000..ad28add7a --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure_cased.txt @@ -0,0 +1,11 @@ +Doscientos metros~200 m +tres horas~3 h +una hora~1 h +Doscientos cuarenta y cinco Millas Por Hora~245 mph +Dos Kilos~2 kg +sesenta coma dos cuatro cero cero kilogramos~60,2400 kg +Menos sesenta coma veinticuatro cero cero kilogramos~-60,2400 kg +menos Ocho Coma Cinco Dos por ciento~-8,52 % +uno Porciento~1 % +tres centímetros~3 cm +dos más dos es igual a cuatro~2 + 2 = 4 \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_money_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_money_cased.txt new file mode 100644 index 000000000..a57e6065a --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_money_cased.txt @@ -0,0 +1,6 @@ +doce dólares y cinco centavos~$12,05 +Doce Dólares Y Cinco Céntimos~$12,05 +setenta y cinco Dólares sesenta y tres~$75,63 +Veintinueve dólares cincuenta centavos~$29,50 +Catorce millones quinientos mil Pesos mexicanos~Mex$14500000 +diez pesos Mexicanos~Mex$10 \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_ordinal_cased.txt 
b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_ordinal_cased.txt new file mode 100644 index 000000000..0dd13fd54 --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_ordinal_cased.txt @@ -0,0 +1,11 @@ +primero~primero +Tercera~Tercera +Primer~Primer +tercer~tercer +Décima~10.ª +undécimo~11.º +Decimoprimer~11.ᵉʳ +Décimo primer~11.ᵉʳ +Décima Primera~11.ª +(technically ungrammatical) décimo primera~(technically ungrammatical) 11.ª +decimotercero~13.º \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_telephone_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_telephone_cased.txt new file mode 100644 index 000000000..068867d68 --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_telephone_cased.txt @@ -0,0 +1,6 @@ +Uno dos tres uno dos tres cinco seis siete ocho~123-123-5678 +uno veintitrés uno veintitrés cincuenta y seis setenta y ocho~123-123-5678 +Uno Dos Tres Cuatro Cinco Seis Siete Ocho Nueve~123-456-789 +Triple tres uno dos tres cinco seis siete ocho~333-123-5678 +Más uno uno dos tres uno dos tres cinco seis siete ocho~+1-123-123-5678 +más cincuenta y cuatro uno dos tres uno dos tres cinco seis siete ocho Extensión doce~+54-123-123-5678 ext. 12 \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time.txt index 8ea4b35f1..e74a63fda 100644 --- a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time.txt @@ -22,4 +22,4 @@ cuarto para las cero~las 23:45 cuarto para las veinticuatro~las 23:45 diez para las doce~las 11:50 dos y media de la tarde~2:30 p.m. -la una de la tarde u t c más cuatro~la 1:00 p.m. utc+4 +la una de la tarde u t c más cuatro~la 1:00 p.m. UTC+4 diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time_cased.txt new file mode 100644 index 000000000..ba450d79b --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time_cased.txt @@ -0,0 +1,9 @@ +las dieciséis cincuenta~las 16:50 +la una~la una +Las dos~Las dos +Las tres personas~Las tres personas +Las Dos a eme~Las 2:00 a.m. +la una Pe Eme~la 1:00 P.M. +la una y diez~la 1:10 +la una y Diez a eme~la 1:10 a.m. +La Una Y Diez pe eme~La 1:10 p.m. \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_whitelist.txt index f142f8954..d6aa3211c 100644 --- a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_whitelist.txt +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_whitelist.txt @@ -1,5 +1,5 @@ -usted~ud. -ustedes~uds. -habla usted español~habla ud. español -hablan ustedes español~hablan uds. español -estados unidos~ee. uu. \ No newline at end of file +usted~Ud. +ustedes~Uds. +habla usted español~habla Ud. español +hablan ustedes español~hablan Uds. español +estados unidos~EE. UU. 
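The capitalized expectations above (Ud., Uds., EE. UU.) come from the label expansion that es/taggers/whitelist.py now performs when input_case == INPUT_CASED. A plain-Python restatement of that expansion follows so the rule can be checked without compiling any FSTs; the sample labels are illustrative stand-ins for load_labels("data/whitelist.tsv"), not the real file contents:

    # Sketch of the cased whitelist expansion (illustrative labels only).
    labels = [["Ud.", "usted"], ["p.ej.", "por ejemplo"], ["BMW", "b m w"]]

    additional_labels = []
    for written, spoken in labels:
        written_capitalized = written[0].upper() + written[1:]
        # accept the spoken form with its first letter capitalized, e.g. "Usted"
        additional_labels.append([written_capitalized, spoken.capitalize()])
        # accept the fully upper-cased spoken form, keeping the conjunction "y" lower
        additional_labels.append([written_capitalized, spoken.upper().replace(" Y ", " y ")])
        # for spelled-out abbreviations such as "b m w", also accept "bmw" and "BMW"
        spoken_no_space = spoken.replace(" ", "")
        if len(spoken) == 2 * len(spoken_no_space) - 1:
            additional_labels.append([written, spoken_no_space])
            additional_labels.append([written_capitalized, spoken_no_space.upper()])

    labels += additional_labels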
\ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_word_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_word_cased.txt new file mode 100644 index 000000000..38681016d --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_word_cased.txt @@ -0,0 +1,11 @@ +~ +Yahoo!~Yahoo! +Veinte!~20 ! +X ~X +—~— +AAA~AAA +Aabach~Aabach +aabenraa~aabenraa +Aachen's~Aachen's +aadri~aadri +aaliyan's~aaliyan's \ No newline at end of file diff --git a/tests/nemo_text_processing/es/test_cardinal.py b/tests/nemo_text_processing/es/test_cardinal.py index 748853db5..bce4c5ecd 100644 --- a/tests/nemo_text_processing/es/test_cardinal.py +++ b/tests/nemo_text_processing/es/test_cardinal.py @@ -23,13 +23,27 @@ class TestCardinal: - inverse_normalizer = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + # inverse_normalizer = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_cardinal.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) + assert pred == expected + + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_cardinal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) diff --git a/tests/nemo_text_processing/es/test_date.py b/tests/nemo_text_processing/es/test_date.py index 150fc23ed..7ed0a5ad5 100644 --- a/tests/nemo_text_processing/es/test_date.py +++ b/tests/nemo_text_processing/es/test_date.py @@ -23,6 +23,9 @@ class TestDate: inverse_normalizer = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_date.txt')) @pytest.mark.run_only_on('CPU') @@ -31,6 +34,16 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_date_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False)
normalizer_with_audio = NormalizerWithAudio( diff --git a/tests/nemo_text_processing/es/test_decimal.py b/tests/nemo_text_processing/es/test_decimal.py index 4e9585011..17a4b48f2 100644 --- a/tests/nemo_text_processing/es/test_decimal.py +++ b/tests/nemo_text_processing/es/test_decimal.py @@ -23,6 +23,9 @@ class TestDecimal: inverse_normalizer = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_decimal.txt')) @pytest.mark.run_only_on('CPU') @@ -31,6 +34,17 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_decimal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git a/tests/nemo_text_processing/es/test_electronic.py b/tests/nemo_text_processing/es/test_electronic.py index 7726e0d9f..1295e223b 100644 --- a/tests/nemo_text_processing/es/test_electronic.py +++ b/tests/nemo_text_processing/es/test_electronic.py @@ -23,6 +23,9 @@ class TestElectronic: inverse_normalizer_es = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_electronic.txt')) @pytest.mark.run_only_on('CPU') @@ -31,6 +34,19 @@ def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) assert pred == expected + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_electronic_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git a/tests/nemo_text_processing/es/test_fraction.py b/tests/nemo_text_processing/es/test_fraction.py index c0022c377..ec2b856b4 100644 --- a/tests/nemo_text_processing/es/test_fraction.py +++ b/tests/nemo_text_processing/es/test_fraction.py @@ -14,6 +14,7 @@ import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized @@ -22,6 +23,21 @@ class TestFraction: + inverse_normalizer = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, 
overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_fraction.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git a/tests/nemo_text_processing/es/test_measure.py b/tests/nemo_text_processing/es/test_measure.py index 4474ea16e..e82d79b62 100644 --- a/tests/nemo_text_processing/es/test_measure.py +++ b/tests/nemo_text_processing/es/test_measure.py @@ -24,6 +24,9 @@ class TestMeasure: inverse_normalizer_es = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_measure.txt')) @pytest.mark.run_only_on('CPU') @@ -32,6 +35,18 @@ def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) assert pred == expected + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_measure_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git a/tests/nemo_text_processing/es/test_money.py b/tests/nemo_text_processing/es/test_money.py index 25f34d810..61253eb39 100644 --- a/tests/nemo_text_processing/es/test_money.py +++ b/tests/nemo_text_processing/es/test_money.py @@ -24,6 +24,9 @@ class TestMoney: inverse_normalizer = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_money.txt')) @pytest.mark.run_only_on('CPU') @@ -31,6 +34,18 @@ class TestMoney: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_money_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) diff --git a/tests/nemo_text_processing/es/test_ordinal.py 
b/tests/nemo_text_processing/es/test_ordinal.py index 5d98c3512..a13971150 100644 --- a/tests/nemo_text_processing/es/test_ordinal.py +++ b/tests/nemo_text_processing/es/test_ordinal.py @@ -24,6 +24,9 @@ class TestOrdinal: inverse_normalizer = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_ordinal.txt')) @pytest.mark.run_only_on('CPU') @@ -32,6 +35,17 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_ordinal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( NormalizerWithAudio(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) diff --git a/tests/nemo_text_processing/es/test_telephone.py b/tests/nemo_text_processing/es/test_telephone.py index 5a0159158..8728bad90 100644 --- a/tests/nemo_text_processing/es/test_telephone.py +++ b/tests/nemo_text_processing/es/test_telephone.py @@ -24,6 +24,9 @@ class TestTelephone: inverse_normalizer_es = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_telephone.txt')) @pytest.mark.run_only_on('CPU') @@ -31,6 +34,18 @@ class TestTelephone: def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) assert pred == expected + + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_telephone_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) diff --git a/tests/nemo_text_processing/es/test_time.py b/tests/nemo_text_processing/es/test_time.py index 86e1c9893..3a0c94f5d 100644 --- a/tests/nemo_text_processing/es/test_time.py +++ b/tests/nemo_text_processing/es/test_time.py @@ -23,6 +23,9 @@ class TestTime: inverse_normalizer_es = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_time.txt')) @pytest.mark.run_only_on('CPU') @@ -31,6 +34,16 @@ def test_denorm_es(self, test_input, expected): pred = 
self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) assert pred == expected + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_time_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git a/tests/nemo_text_processing/es/test_word.py b/tests/nemo_text_processing/es/test_word.py index 87ab24070..21a8f5c54 100644 --- a/tests/nemo_text_processing/es/test_word.py +++ b/tests/nemo_text_processing/es/test_word.py @@ -23,6 +23,9 @@ class TestWord: inverse_normalizer_es = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_word.txt')) @pytest.mark.run_only_on('CPU') @@ -30,6 +33,18 @@ class TestWord: def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) assert pred == expected + + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_word_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer_es = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio_es = ( From 5fcafaeff1244d3e867c8a3320e35d84e46b54e9 Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Wed, 28 Feb 2024 13:17:40 -0800 Subject: [PATCH 02/18] enable capitalized itn for es_en Signed-off-by: Mariana Graterol Fuenmayor --- .../es_en/data/es_whitelist.tsv | 30 +++++----- .../es_en/taggers/tokenize_and_classify.py | 22 +++---- .../es_en/verbalizers/verbalize.py | 4 +- .../test_cases_cardinal_cased.txt | 60 +++++++++++++++++++ .../test_cases_date.txt | 2 +- .../test_cases_date_cased.txt | 20 +++++++ .../test_cases_decimal_cased.txt | 19 ++++++ .../test_cases_electronic_cased.txt | 14 +++++ .../test_cases_measure.txt | 2 +- .../test_cases_measure_cased.txt | 22 +++++++ .../test_cases_money_cased.txt | 16 +++++ .../test_cases_ordinal_cased.txt | 21 +++++++ .../test_cases_telephone_cased.txt | 19 ++++++ .../test_cases_time.txt | 36 +++++------ .../test_cases_time_cased.txt | 19 ++++++ .../test_cases_whitelist.txt | 10 ++-- .../test_cases_word_cased.txt | 17 ++++++ .../es_en/test_cardinal.py | 11 +++- tests/nemo_text_processing/es_en/test_date.py | 10 ++++ .../es_en/test_decimal.py | 10 ++++ .../es_en/test_electronic.py | 14 ++++- .../es_en/test_measure.py | 14 ++++- .../nemo_text_processing/es_en/test_money.py | 12 +++- .../es_en/test_ordinal.py | 12 +++- .../es_en/test_telephone.py | 14 ++++- tests/nemo_text_processing/es_en/test_time.py | 14 ++++- tests/nemo_text_processing/es_en/test_word.py | 14 ++++- 27 files changed, 393 insertions(+), 65 deletions(-) 
create mode 100644 tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_cardinal_cased.txt create mode 100644 tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date_cased.txt create mode 100644 tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_decimal_cased.txt create mode 100644 tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_electronic_cased.txt create mode 100644 tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure_cased.txt create mode 100644 tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_money_cased.txt create mode 100644 tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_ordinal_cased.txt create mode 100644 tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_telephone_cased.txt create mode 100644 tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time_cased.txt create mode 100644 tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_word_cased.txt diff --git a/nemo_text_processing/inverse_text_normalization/es_en/data/es_whitelist.tsv b/nemo_text_processing/inverse_text_normalization/es_en/data/es_whitelist.tsv index 60253820a..cfa739c1a 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/data/es_whitelist.tsv +++ b/nemo_text_processing/inverse_text_normalization/es_en/data/es_whitelist.tsv @@ -1,16 +1,16 @@ -ud. usted -uds. ustedes -vd. vosotros -vds. vosotros -dr. doctor -dra. doctora -d. don -da. doña -ee. uu. estados unidos +Ud. usted +Uds. ustedes +Vd. vosotros +Vds. vosotros +Dr. doctor +Dra. doctora +D. don +Da. doña +EE. UU. estados unidos p.ej. por ejemplo -prof. profesor -profa. profesora -sr. señor -sra. señora -srta. señorita -etc. etcétera +Prof. profesor +Profa. profesora +Sr. señor +Sra. señora +Srta. señorita +etc. 
etcétera \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py index abf54edb6..c43a6ad3f 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py @@ -89,27 +89,27 @@ def __init__( else: logger.info(f"Creating ClassifyFst grammars.") - cardinal = CardinalFst() + cardinal = CardinalFst(input_case=input_case) cardinal_graph = cardinal.fst - ordinal = OrdinalFst(cardinal) + ordinal = OrdinalFst(cardinal, input_case=input_case) ordinal_graph = ordinal.fst - decimal = DecimalFst(cardinal) + decimal = DecimalFst(cardinal, input_case=input_case) decimal_graph = decimal.fst - fraction = FractionFst(cardinal, ordinal) + fraction = FractionFst(cardinal, ordinal, input_case=input_case) fraction_graph = fraction.fst - measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction).fst - date_graph = DateFst(cardinal).fst + measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, input_case=input_case).fst + date_graph = DateFst(cardinal, input_case=input_case).fst word_graph = WordFst().fst - time_graph = TimeFst().fst - money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst + time_graph = TimeFst(input_case=input_case).fst + money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, input_case=input_case).fst whitelist_graph = WhiteListFst(input_file=whitelist).fst punct_graph = PunctuationFst().fst - electronic_graph = ElectronicFst().fst - telephone_graph = TelephoneFst().fst + electronic_graph = ElectronicFst(input_case=input_case).fst + telephone_graph = TelephoneFst(input_case=input_case).fst en_cardinal = EnCardinalFst(input_case=input_case) en_cardinal_graph = en_cardinal.fst @@ -150,7 +150,7 @@ def __init__( | pynutil.add_weight(en_money_graph, 1.1) | pynutil.add_weight(telephone_graph, 1.6) | pynutil.add_weight(en_telephone_graph, 1.1) - | pynutil.add_weight(electronic_graph, 1.6) + | pynutil.add_weight(electronic_graph, 2.3) | pynutil.add_weight(en_electronic_graph, 1.1) | pynutil.add_weight(word_graph, 100) | pynutil.add_weight(en_word_graph, 120) diff --git a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py index 8e540a616..3eb7ba3a6 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py @@ -74,10 +74,12 @@ def __init__(self): en_date_graph = EnDateFst().fst en_whitelist_graph = EnWhiteListFst().fst en_telephone_graph = EnTelephoneFst().fst + en_time_graph = EnTimeFst().fst en_electronic_graph = EnElectronicFst().fst graph = ( - time_graph + en_time_graph + | pynutil.add_weight(time_graph, 1.1) | date_graph | pynutil.add_weight(en_date_graph, 1.1) | money_graph diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_cardinal_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_cardinal_cased.txt new file mode 100644 index 000000000..d6fc7a85e --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_cardinal_cased.txt @@ -0,0 +1,60 @@ +Doscientos cincuenta y uno~251 +Novecientos noventa y nueve millones 
novecientos noventa y nueve mil novecientos noventa y nueve~999999999 +Cero~Cero +Uno~Uno +una~una +dos~dos +Nueve~Nueve +Diez~10 +, uno~, uno +, diez~, 10 +Menos veintitrés~-23 +cien~100 +ciento uno~101 +ciento un~101 +ciento una~101 +mil y uno~1001 +Mil una~1001 +nueve billones setecientos ochenta y nueve mil trescientos ochenta y dos millones quinientos treinta y seis mil ciento treinta~9789382536130 +Doscientos cincuenta y cuatro~254 +ciento cuarenta y siete mil cuatrocientos cincuenta y uno~147451 +Un Millón ciento cincuenta y seis mil ciento setenta y tres~1156173 +Mil quinientos noventa y tres millones setenta y dos mil novecientos sesenta y uno~1593072961 +noventa y siete mil ochocientos ocho billones doscientos sesenta y cuatro mil setecientos setenta y dos millones setecientos noventa y dos mil cinco~97808264772792005 +diecisiete mil ochocientos cincuenta y cinco trillones treinta y seis mil seiscientos cincuenta y siete billones siete mil quinientos noventa y seis millones ciento diez mil novecientos cuarenta y nueve~17855036657007596110949 +diez mil diez billones diez millones cien mil diez~10010000010100010 +Menos veinticinco mil treinta y siete~-25037 +mil doscientos sesenta y cuatro billones trescientos un mil novecientos treinta y ocho millones ciento cuatro~1264301938000104 +menos sesenta~-60 +cuarenta y seis mil seiscientos sesenta y cuatro~46664 +sesenta~60 +Nine trillion seven hundred eighty nine billion three hundred eighty two million five hundred thirty six thousand one hundred thirty~9789382536130 +Two hundred and fifty four~254 +One hundred forty seven thousand four hundred fifty one~147451 +One million one hundred fifty six thousand one hundred seventy three~1156173 +One billion five hundred ninety three million seventy two thousand nine hundred sixty one~1593072961 +Ninety seven quadrillion eight hundred eight trillion two hundred sixty four billion seven hundred seventy two million seven hundred ninety two thousand five~97808264772792005 +Seventeen sextillion eight hundred fifty five quintillion thirty six quadrillion six hundred fifty seven trillion seven billion five hundred ninety six million one hundred ten thousand nine hundred forty nine~17855036657007596110949 +Ten quadrillion ten trillion ten million one hundred thousand ten~10010000010100010 +Minus twenty five thousand thirty seven~-25037 +One quadrillion two hundred sixty four trillion three hundred one billion nine hundred thirty eight million one hundred four~1264301938000104 +Minus sixty~-60 +Forty six thousand six hundred sixty four~46664 +Sixty~60 +Zero~Zero +Two million three~2000003 +One thousand thirteen~1013 +One thousand one~1001 +One thousand one hundred~1100 +One thousand twenty six~1026 +One thousand one hundred twenty six~1126 +Eighteen million four hundred fifty thousand nine hundred ninety~18450990 +Eighteen million nine hundred forty thousand seven hundred twenty two~18940722 +Eighteen million six hundred ninety thousand nine hundred sixteen~18690916 +Eighteen thousand eight hundred eighty~18880 +Eleven hundred~1100 +Twenty one hundred~2100 +Twenty one hundred and eleven~2111 +Eleven hundred twenty one~1121 +Nineteen~19 +Twelve~Twelve \ No newline at end of file diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date.txt index f961fbfbb..1147a12c7 100644 --- a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date.txt +++ 
b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date.txt @@ -13,7 +13,7 @@ two thousand and nine~2009 the twenty fifth of july twenty twelve~25 july 2012 the twenty fifth of july two thousand twelve~25 july 2012 the twenty second of july twenty twelve~22 july 2012 -the fifteenth of january~15 de january +the fifteenth of january~15 january the seventeenth of may twenty ten~17 may 2010 january first~january 1 july twenty second two thousand eight~july 22 2008 diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date_cased.txt new file mode 100644 index 000000000..ab8358dd6 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date_cased.txt @@ -0,0 +1,20 @@ +Primero De Enero~1 de Enero +Uno de enero~1 de Enero +el uno de Diciembre~el 1 de Diciembre +El primero de diciembre~El 1 de diciembre +Domingo Veintiséis De Octubre~Domingo 26 de Octubre +treinta y uno de diciembre de mil novecientos noventa y dos~31 de diciembre de 1992 +Siglo diecinueve~Siglo xix +doscientos tres antes de Cristo~203 A. C. +January first~January 1 +July twenty second two thousand eight~July 22 2008 +June thirty~June 30 +July twenty fifth twenty twelve~July 25 2012 +Nineteen seventeen~1917 +Twenty twelve~2012 +March sixteen sixty five~March 1665 +Sixteen sixty five~1665 +July two thousand twelve~July 2012 +October nineteen oh five~October 1905 +July fifteen o six~July 1506 +The twenty fifth of july twenty twelve~25 july 2012 \ No newline at end of file diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_decimal_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_decimal_cased.txt new file mode 100644 index 000000000..17b91271d --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_decimal_cased.txt @@ -0,0 +1,19 @@ +Uno coma dos seis~1,26 +Menos uno coma dos seis~-1,26 +Uno Coma Veintiséis~1,26 +Cero coma Dos seis~0,26 +cero coma veintiséis~0,26 +tres coma ciento cuarenta y uno~3,141 +One point two five billion~1.25 billion +Thirteen billion~13 billion +Thirty billion~30 billion +Thirty Billion~30 Billion +Two thousand eight hundred five point eight seven three billion~2805.873 billion +Eighteen~18 +Eighteen point eight five~18.85 +Eighteen point five o~18.50 +Eighteen point five six~18.56 +Eighteen point nine~18.9 +Eighteen point o five~18.05 +Eighteen point one two~18.12 +Eighteen point o one~18.01 diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_electronic_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_electronic_cased.txt new file mode 100644 index 000000000..9131f4f94 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_electronic_cased.txt @@ -0,0 +1,14 @@ +A punto B C Arroba G mail punto com~A.BC@gmail.com +c d f Arroba a b c Punto e d u~cdf@abc.edu +W W W Punto N vidia Punto com~www.nvidia.com +Doble ve doble ve doble ve punto a b c punto es barra e f g~www.abc.es/efg +Doble Ve Doble Ve Doble Ve Punto a b c Punto e s~www.abc.es +N vidia dot com~nvidia.com +Abc at gmail dot com~Abc@gmail.com +Athreed at gmail dot com~Athreed@gmail.com +Kore dot ai~Kore.ai +NVIDIA dot com~NVIDIA.com +NVIDIA dot COM~NVIDIA.COM +WWW.A B C at A B C dot com~WWW.A BC@ABC.com +W W W. 
A B C dot com~www.ABC.com +w w w . o u r d a i l y n e w s dot com . s m~www.ourdailynews.com . s m \ No newline at end of file diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure.txt index 0a9b90903..01278697f 100644 --- a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure.txt @@ -13,7 +13,7 @@ cuatro segundos~4 s cinco litros~5 l tres metros cúbicos~3 m³ dos kilómetros por hora~2 kph -diez grados farenheit~10 ° f +diez grados farenheit~10 ° F dos metros y medio~2 1/2 m tres quintos de metro~3/5 m menos tres y medio metros por hora~-3 1/2 m/h diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure_cased.txt new file mode 100644 index 000000000..7073a7bf6 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure_cased.txt @@ -0,0 +1,22 @@ +Doscientos metros~200 m +tres horas~3 h +una hora~1 h +Doscientos cuarenta y cinco Millas Por Hora~245 mph +Dos Kilos~2 kg +sesenta coma dos cuatro cero cero kilogramos~60,2400 kg +Menos sesenta coma veinticuatro cero cero kilogramos~-60,2400 kg +menos Ocho Coma Cinco Dos por ciento~-8,52 % +uno Porciento~1 % +tres centímetros~3 cm +dos más dos es igual a cuatro~2 + 2 = 4 +Eight point five megawatts~8.5 mW +Eight point five meters~8.5 m +Eight point five two percent~8.52 % +Eight point four four percent~8.44 % +one gigabit per second~1 gbps +nine gigabits per second~9 gbps +five degrees celsius~5 °C +seventy two degrees fahrenheit~72 °F +Seventy two Degrees Fahrenheit~72 °F +two hundred seventy three kelvin~273 K +Nine GigaBits per second~9 gbps \ No newline at end of file diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_money_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_money_cased.txt new file mode 100644 index 000000000..613a822a6 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_money_cased.txt @@ -0,0 +1,16 @@ +doce dólares y cinco centavos~$12,05 +Doce Dólares Y Cinco Céntimos~$12,05 +setenta y cinco Dólares sesenta y tres~$75,63 +Veintinueve dólares cincuenta centavos~$29,50 +Catorce millones quinientos mil Pesos mexicanos~Mex$14500000 +diez pesos Mexicanos~Mex$10 +Two dollars~$2 +One cent~$0.01 +Four united states dollars and sixty nine cents~$4.69 +Seventy five dollars sixty three~$75.63 +Twenty nine dollars fifty cents~$29.50 +Eleven dollars and fifty one cents~$11.51 +Nine hundred ninety three dollars and ninety two cents~$993.92 +Four hundred sixty billion won~₩460 billion +Thirty billion yen~¥30 billion +Two point five billion dollars~$2.5 billion diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_ordinal_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_ordinal_cased.txt new file mode 100644 index 000000000..7b7df1ed7 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_ordinal_cased.txt @@ -0,0 +1,21 @@ +primero~primero +Tercera~Tercera +Primer~Primer +tercer~tercer +Décima~10.ª +undécimo~11.º +Decimoprimer~11.ᵉʳ +Décimo primer~11.ᵉʳ +Décima Primera~11.ª 
+(technically ungrammatical) décimo primera~(technically ungrammatical) 11.ª +decimotercero~13.º +One hundredth~100th +Twenty five thousand one hundred eleventh~25111th +Second~2nd +Zeroth~0th +First~1st +Second~2nd +Third~3rd +Fourth~4th +Eleventh~11th +Twelfth~12th diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_telephone_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_telephone_cased.txt new file mode 100644 index 000000000..6b9613da0 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_telephone_cased.txt @@ -0,0 +1,19 @@ +Uno dos tres uno dos tres cinco seis siete ocho~123-123-5678 +uno veintitrés uno veintitrés cincuenta y seis setenta y ocho~123-123-5678 +Uno Dos Tres Cuatro Cinco Seis Siete Ocho Nueve~123-456-789 +Triple tres uno dos tres cinco seis siete ocho~333-123-5678 +Más uno uno dos tres uno dos tres cinco seis siete ocho~+1-123-123-5678 +más cincuenta y cuatro uno dos tres uno dos tres cinco seis siete ocho Extensión doce~+54-123-123-5678 ext. 12 +One two three one two three five six seven eight~123-123-5678 +Plus nine one one two three one two three five six seven eight~+91 123-123-5678 +Plus forty four one two three one two three five six seven eight~+44 123-123-5678 +O two three one two three five six seven eight~023-123-5678 +Oh two three one two three five six seven eight~023-123-5678 +Double oh three one two three five six seven eight~003-123-5678 +Two two five dot double five dot o dot four o~225.55.0.40 +Two two five dot double five dot o dot forty five~225.55.0.45 +SSN is seven double nine one two three double one three~SSN is 799-12-3113 +Seven nine nine~799 +A b nine~Ab9 +A b c~A b c +Five w k r a three one~5wkra31 diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time.txt index e12512e3a..17113ad98 100644 --- a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time.txt @@ -21,23 +21,23 @@ las dos de la tarde~las 2:00 p.m. cuarto para las cero~las 23:45 cuarto para las veinticuatro~las 23:45 diez para las doce~las 11:50 -dos y media de la tarde~2:30 p.m. -la una de la tarde u t c más cuatro~la 1:00 p.m. utc+4 -eight oclock g m t~8:00 gmt -seven a m e s t~7:00 a.m. est -two p m~2:00 p.m. -two thirty~2:30 -three o'clock~3:00 -quarter past one~1:15 -half past three~3:30 -eight fifty one~8:51 -eight fifty two~8:52 -eight forty~8:40 -eight nineteen~8:19 -eight o six~8:06 -eight thirty eight~8:38 -eight thirty two~8:32 -eight twenty nine~8:29 +dos y media de la tarde~02:30 p.m. +la una de la tarde u t c más cuatro~la 1:00 p.m. UTC+4 +eight oclock g m t~08:00 gmt +seven a m e s t~07:00 a.m. est +two p m~02:00 p.m. +two thirty~02:30 +three o'clock~03:00 +quarter past one~01:15 +half past three~03:30 +eight fifty one~08:51 +eight fifty two~08:52 +eight forty~08:40 +eight nineteen~08:19 +eight o six~08:06 +eight thirty eight~08:38 +eight thirty two~08:32 +eight twenty nine~08:29 eleven fifty five p m~11:55 p.m. eleven fifty three p m~11:53 p.m. eleven forty a m~11:40 a.m. @@ -47,7 +47,7 @@ eleven forty six a m~11:46 a.m. eleven o six p m~11:06 p.m. eleven thirteen a m~11:13 a.m. 
half past twelve~12:30 -quarter past one~1:15 +quarter past one~01:15 quarter to one~12:45 quarter to twelve~11:45 set alarm at ten to eleven pm~set alarm at 10:50 p.m. diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time_cased.txt new file mode 100644 index 000000000..875ff97d6 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time_cased.txt @@ -0,0 +1,19 @@ +las dieciséis cincuenta~las 16:50 +la una~la una +Las dos~Las dos +Las tres personas~Las tres personas +Las Dos a eme~Las 2:00 a.m. +la una Pe Eme~la 1:00 P.M. +la una y diez~la 1:10 +la una y Diez a eme~la 1:10 a.m. +La Una Y Diez pe eme~La 1:10 p.m. +Eight oclock g m t~08:00 gmt +Seven a m e s t~07:00 a.m. est +Two p m~02:00 p.m. +Seven A M E S T~07:00 A.M. EST +Two P M~02:00 P.M. +Two thirty~02:30 +Set alarm at ten to eleven pm~Set alarm at 10:50 p.m. +One min to one am~12:59 a.m. +eleven Forty six A M~11:46 A.M. +eleven forty six AM~11:46 A.M. \ No newline at end of file diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_whitelist.txt index 895fb52c2..90c024aa1 100644 --- a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_whitelist.txt +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_whitelist.txt @@ -1,8 +1,8 @@ -usted~ud. -ustedes~uds. -habla usted español~habla ud. español -hablan ustedes español~hablan uds. español -estados unidos~ee. uu. +usted~Ud. +ustedes~Uds. +habla usted español~habla Ud. español +hablan ustedes español~hablan Uds. español +estados unidos~EE. UU. doctor dao~dr. dao misses smith~mrs. smith mister dao~mr. dao diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_word_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_word_cased.txt new file mode 100644 index 000000000..ebe0c0864 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_word_cased.txt @@ -0,0 +1,17 @@ +~ +Yahoo!~Yahoo! +Veinte!~20 ! 
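The cased test files follow the same spoken~written convention as the existing lower-cased ones, with '~' separating the spoken input from the expected written form. As a rough, self-contained sketch (the standalone parser and the hard-coded file name here are illustrative assumptions; the actual tests below load these files through parse_test_case_file and pytest parametrization), one of them could be replayed against a cased inverse normalizer like this:

from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer


def load_cases(path):
    # Each non-empty line holds "spoken~written"; '~' separates input and expected output.
    with open(path, encoding="utf-8") as f:
        return [line.rstrip("\n").split("~", 1) for line in f if line.strip()]


if __name__ == "__main__":
    # input_case="cased" mirrors the test fixtures below; cache_dir is optional.
    itn = InverseNormalizer(lang="es_en", input_case="cased")
    for spoken, expected in load_cases("test_cases_word_cased.txt"):
        pred = itn.inverse_normalize(spoken, verbose=False)
        print(f"{spoken!r} -> {pred!r} (expected {expected!r})")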
+X ~X +—~— +AAA~AAA +Aabach~Aabach +aabenraa~aabenraa +Aachen's~Aachen's +Aadri~Aadri +Aaliyan's~Aaliyan's +Aahar~Aahar +Aahh~Aahh +Aahperd~Aahperd +Aaibinterstate~Aaibinterstate +Aajab~Aajab +Aakasa~Aakasa \ No newline at end of file diff --git a/tests/nemo_text_processing/es_en/test_cardinal.py b/tests/nemo_text_processing/es_en/test_cardinal.py index 02c623fb6..d0b26b874 100644 --- a/tests/nemo_text_processing/es_en/test_cardinal.py +++ b/tests/nemo_text_processing/es_en/test_cardinal.py @@ -20,8 +20,10 @@ class TestCardinal: - inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_cardinal.txt')) @pytest.mark.run_only_on('CPU') @@ -29,3 +31,10 @@ class TestCardinal: def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_cardinal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_date.py b/tests/nemo_text_processing/es_en/test_date.py index 0cac5dabd..efcccc79e 100644 --- a/tests/nemo_text_processing/es_en/test_date.py +++ b/tests/nemo_text_processing/es_en/test_date.py @@ -21,6 +21,9 @@ class TestDate: inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_date.txt')) @pytest.mark.run_only_on('CPU') @@ -28,3 +31,10 @@ class TestDate: def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_date_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_decimal.py b/tests/nemo_text_processing/es_en/test_decimal.py index 01c9fcf4c..fd63b034a 100644 --- a/tests/nemo_text_processing/es_en/test_decimal.py +++ b/tests/nemo_text_processing/es_en/test_decimal.py @@ -21,6 +21,9 @@ class TestDecimal: inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_decimal.txt')) @pytest.mark.run_only_on('CPU') @@ -28,3 +31,10 @@ class TestDecimal: def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_decimal_cased.txt')) + 
@pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_electronic.py b/tests/nemo_text_processing/es_en/test_electronic.py index d3c4a921b..10212e6ba 100644 --- a/tests/nemo_text_processing/es_en/test_electronic.py +++ b/tests/nemo_text_processing/es_en/test_electronic.py @@ -20,11 +20,21 @@ class TestElectronic: - inverse_normalizer_es = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_electronic.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm_es(self, test_input, expected): - pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_electronic_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_measure.py b/tests/nemo_text_processing/es_en/test_measure.py index 56defb8a8..eff6f3209 100644 --- a/tests/nemo_text_processing/es_en/test_measure.py +++ b/tests/nemo_text_processing/es_en/test_measure.py @@ -21,11 +21,21 @@ class TestMeasure: - inverse_normalizer_es = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_measure.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm_es(self, test_input, expected): - pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_measure_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_money.py b/tests/nemo_text_processing/es_en/test_money.py index 3d3e75656..26c09c90c 100644 --- a/tests/nemo_text_processing/es_en/test_money.py +++ b/tests/nemo_text_processing/es_en/test_money.py @@ -22,10 +22,20 @@ class TestMoney: inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_money.txt')) 
@pytest.mark.run_only_on('CPU') @pytest.mark.unit - def test_denorm(self, test_input, expected): + def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_money_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_ordinal.py b/tests/nemo_text_processing/es_en/test_ordinal.py index 0b4a9cad7..9f7c37799 100644 --- a/tests/nemo_text_processing/es_en/test_ordinal.py +++ b/tests/nemo_text_processing/es_en/test_ordinal.py @@ -22,10 +22,20 @@ class TestOrdinal: inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_ordinal.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit - def test_denorm(self, test_input, expected): + def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_ordinal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_telephone.py b/tests/nemo_text_processing/es_en/test_telephone.py index 4b86eeb94..ad0360782 100644 --- a/tests/nemo_text_processing/es_en/test_telephone.py +++ b/tests/nemo_text_processing/es_en/test_telephone.py @@ -21,11 +21,21 @@ class TestTelephone: - inverse_normalizer_es = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_telephone.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm_es(self, test_input, expected): - pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_telephone_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_time.py b/tests/nemo_text_processing/es_en/test_time.py index 1fbba3d90..069068b1f 100644 --- a/tests/nemo_text_processing/es_en/test_time.py +++ b/tests/nemo_text_processing/es_en/test_time.py @@ -20,11 +20,21 @@ class TestTime: - inverse_normalizer_es = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer = 
InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_time.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm_es(self, test_input, expected): - pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_time_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_word.py b/tests/nemo_text_processing/es_en/test_word.py index 3cd465165..8404926bd 100644 --- a/tests/nemo_text_processing/es_en/test_word.py +++ b/tests/nemo_text_processing/es_en/test_word.py @@ -20,11 +20,21 @@ class TestWord: - inverse_normalizer_es = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_word.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm_es(self, test_input, expected): - pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_word_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) assert pred == expected From 64fd6c7ea6201a8913b8e167e06ae8334d22698a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 Feb 2024 21:23:41 +0000 Subject: [PATCH 03/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../es/taggers/cardinal.py | 3 -- .../es/taggers/date.py | 1 - .../es/taggers/decimal.py | 19 ++++---- .../es/taggers/electronic.py | 47 ++++++++----------- .../es/taggers/fraction.py | 1 - .../es/taggers/measure.py | 1 - .../es/taggers/money.py | 13 +++-- .../es/taggers/ordinal.py | 3 +- .../es/taggers/telephone.py | 8 +++- .../es/taggers/time.py | 11 ++--- .../es/taggers/tokenize_and_classify.py | 4 +- .../es/taggers/whitelist.py | 6 ++- .../es/verbalizers/date.py | 11 ++--- .../es_en/taggers/tokenize_and_classify.py | 4 +- .../text_normalization/es/graph_utils.py | 1 + .../nemo_text_processing/es/test_cardinal.py | 2 +- tests/nemo_text_processing/es/test_date.py | 2 +- tests/nemo_text_processing/es/test_decimal.py | 1 - .../es/test_electronic.py | 3 -- .../nemo_text_processing/es/test_fraction.py | 3 +- tests/nemo_text_processing/es/test_measure.py | 2 - tests/nemo_text_processing/es/test_money.py | 4 +- tests/nemo_text_processing/es/test_ordinal.py | 1 - 
.../nemo_text_processing/es/test_telephone.py | 4 +- tests/nemo_text_processing/es/test_word.py | 4 +- 25 files changed, 69 insertions(+), 90 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py index 2769880d0..edb6081d0 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py @@ -27,7 +27,6 @@ delete_space, ) from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS -from pynini.lib import pynutil class CardinalFst(GraphFst): @@ -178,7 +177,6 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize() self.numbers_up_to_million = numbers_up_to_million.optimize() - if input_case == INPUT_CASED: graph |= capitalized_input_graph(graph) graph_digit |= capitalized_input_graph(graph_digit) @@ -201,7 +199,6 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() - def delete_word(self, word: str): """ Capitalizes word for `cased` input""" delete_graph = pynutil.delete(word).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py index 709a11f3d..b138f8a12 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py @@ -25,7 +25,6 @@ delete_extra_space, delete_space, ) -from pynini.lib import pynutil class DateFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py index 2a81da751..cffbc8768 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py @@ -29,12 +29,11 @@ delete_space, ) from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS -from pynini.lib import pynutil def get_quantity( decimal: 'pynini.FstLike', cardinal_up_to_million: 'pynini.FstLike', input_case: str = INPUT_LOWER_CASED - ) -> 'pynini.FstLike': +) -> 'pynini.FstLike': """ Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, e.g. 
one million -> integer_part: "1" quantity: "million" @@ -115,7 +114,6 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): decimal_point |= pynini.cross("Coma", "morphosyntactic_features: \",\"") decimal_point |= pynini.cross("Punto", "morphosyntactic_features: \".\"") - optional_graph_negative = pynini.closure( pynutil.insert("negative: ") + pynini.cross(ES_MINUS, "\"true\"") + delete_extra_space, 0, 1 ) @@ -132,22 +130,21 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): ) final_graph = optional_graph_negative + final_graph_wo_sign - self.final_graph_wo_negative = final_graph_wo_sign | get_quantity( - final_graph_wo_sign, cardinal.numbers_up_to_million, input_case=input_case - ).optimize() - + self.final_graph_wo_negative = ( + final_graph_wo_sign + | get_quantity(final_graph_wo_sign, cardinal.numbers_up_to_million, input_case=input_case).optimize() + ) + # accept semiotic spans that start with a capital letter self.final_graph_wo_negative |= pynutil.add_weight( pynini.compose(TO_LOWER + NEMO_SIGMA, self.final_graph_wo_negative), MIN_NEG_WEIGHT ).optimize() - quantity_graph = get_quantity( - final_graph_wo_sign, cardinal.numbers_up_to_million, input_case=input_case - ) + quantity_graph = get_quantity(final_graph_wo_sign, cardinal.numbers_up_to_million, input_case=input_case) final_graph |= optional_graph_negative + quantity_graph if input_case == INPUT_CASED: final_graph |= capitalized_input_graph(final_graph) - + final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py index 0b4e63da4..3bc6a8b6d 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py @@ -13,6 +13,8 @@ # limitations under the License. 
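The recurring pattern in these taggers for input_case == INPUT_CASED is to keep the lower-cased grammar and additionally accept an input whose first letter is capitalized, by lower-casing that letter before composition and weighting the extra path the same way the decimal tagger above does. A minimal pynini sketch of that idea, assuming the TO_LOWER, NEMO_SIGMA and MIN_NEG_WEIGHT symbols from en.graph_utils; this is not the actual capitalized_input_graph helper, only an illustration of the mechanism:

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import MIN_NEG_WEIGHT, NEMO_SIGMA, TO_LOWER


def accept_leading_capital(graph: 'pynini.FstLike') -> 'pynini.FstLike':
    # TO_LOWER rewrites a single ASCII uppercase character to its lowercase form, so
    # "Uno coma dos" reaches the lower-cased grammar as "uno coma dos".
    capitalized = pynini.compose(TO_LOWER + NEMO_SIGMA, graph)
    # Keep the original grammar and add the capitalized variant with the same tiny
    # weight the decimal tagger applies above.
    return (graph | pynutil.add_weight(capitalized, MIN_NEG_WEIGHT)).optimize()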
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.utils import get_various_formats from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( @@ -25,7 +27,6 @@ insert_space, ) from nemo_text_processing.text_normalization.en.utils import load_labels -from pynini.lib import pynutil class ElectronicFst(GraphFst): @@ -78,11 +79,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): server_names = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).invert() if input_case == INPUT_CASED: server_names = capitalized_input_graph(server_names) - server = ( - single_alphanum - | server_names - | pynini.closure(NEMO_ALPHA, 2) - ) + server = single_alphanum | server_names | pynini.closure(NEMO_ALPHA, 2) if input_case == INPUT_CASED: domain = [] @@ -102,48 +99,42 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): + domain + pynutil.insert("\"") ) - + at = pynini.accep("arroba") if input_case == INPUT_CASED: at |= pynini.accep("Arroba") - graph = ( - username + delete_extra_space + pynutil.delete(at) + insert_space + delete_extra_space + domain_graph - ) + graph = username + delete_extra_space + pynutil.delete(at) + insert_space + delete_extra_space + domain_graph ############# url ### if input_case == INPUT_CASED: spoken_ws = pynini.union( - "doble ve doble ve doble ve", - "Doble Ve Doble Ve Doble Ve", - "Doble ve doble ve doble ve" + "doble ve doble ve doble ve", "Doble Ve Doble Ve Doble Ve", "Doble ve doble ve doble ve" ) protocol_end = pynini.cross(pynini.union(*get_various_formats("www")) | spoken_ws, "www") - spoken_http = pynini.union( - "hache te te pe", - "Hache te te pe", - "Hache Te Te Pe" + spoken_http = pynini.union("hache te te pe", "Hache te te pe", "Hache Te Te Pe") + spoken_https = pynini.union("hache te te pe ese", "Hache te te pe ese", "Hache Te Te Pe Ese") + protocol_start = pynini.cross( + pynini.union(*get_various_formats("http")) | spoken_http, "http" + ) | pynini.cross(pynini.union(*get_various_formats("https")) | spoken_https, "https") + else: + protocol_end = pynutil.add_weight( + pynini.cross(pynini.union("www", "w w w", "doble ve doble ve doble ve"), "www"), MIN_POS_WEIGHT ) - spoken_https = pynini.union( - "hache te te pe ese", - "Hache te te pe ese", - "Hache Te Te Pe Ese" + protocol_start = pynutil.add_weight( + pynini.cross(pynini.union("http", "h t t p", "hache te te pe"), "http"), MIN_POS_WEIGHT ) - protocol_start = pynini.cross(pynini.union(*get_various_formats("http")) | spoken_http, "http") | pynini.cross( - pynini.union(*get_various_formats("https")) | spoken_https, "https" + protocol_start |= pynutil.add_weight( + pynini.cross(pynini.union("https", "h t t p s", "hache te te pe ese"), "https"), MIN_POS_WEIGHT ) - else: - protocol_end = pynutil.add_weight(pynini.cross(pynini.union("www", "w w w", "doble ve doble ve doble ve"), "www"), MIN_POS_WEIGHT) - protocol_start = pynutil.add_weight(pynini.cross(pynini.union("http", "h t t p", "hache te te pe"), "http"), MIN_POS_WEIGHT) - protocol_start |= pynutil.add_weight(pynini.cross(pynini.union("https", "h t t p s", "hache te te pe ese"), "https"), MIN_POS_WEIGHT) protocol_start += pynini.cross(" dos puntos barra barra ", "://") # e.g. 
.com, .es ending = ( delete_extra_space - + symbols + + symbols + delete_extra_space + (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username) ) diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py index d856bfc37..885a6c233 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py @@ -19,7 +19,6 @@ from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import INPUT_LOWER_CASED, NEMO_SIGMA, NEMO_SPACE, GraphFst from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS -from pynini.lib import pynutil class FractionFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py index f9091c352..0377b8c00 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py @@ -28,7 +28,6 @@ delete_space, ) from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/money.py b/nemo_text_processing/inverse_text_normalization/es/taggers/money.py index 9f1dd658a..a65d9bcd2 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/money.py @@ -60,11 +60,15 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU if input_case == INPUT_CASED: unit_singular = capitalized_input_graph(unit_singular) - unit_singular_capitalized = pynini.string_file(get_abs_path("data/money/currency_major_singular_capitalized.tsv")) + unit_singular_capitalized = pynini.string_file( + get_abs_path("data/money/currency_major_singular_capitalized.tsv") + ) unit_singular |= pynini.invert(unit_singular_capitalized).optimize() - + unit_plural = capitalized_input_graph(unit_plural) - unit_plural_capitalized = pynini.string_file(get_abs_path("data/money/currency_major_plural_capitalized.tsv")) + unit_plural_capitalized = pynini.string_file( + get_abs_path("data/money/currency_major_plural_capitalized.tsv") + ) unit_plural |= pynini.invert(unit_plural_capitalized).optimize() unit_minor_singular = capitalized_input_graph(unit_minor_singular).optimize() @@ -92,7 +96,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU + insert_space + pynutil.insert("fractional_part: \"") + pynini.union( - pynutil.add_weight(((NEMO_SIGMA - one_graph) @ cardinal_graph), -0.7) @ add_leading_zero_to_double_digit + pynutil.add_weight(((NEMO_SIGMA - one_graph) @ cardinal_graph), -0.7) + @ add_leading_zero_to_double_digit + delete_space, pynini.cross(one_graph, "01") + delete_space, ) diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py index bd791e359..1fe148216 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py @@ -24,7 +24,6 @@ capitalized_input_graph, delete_space, ) -from pynini.lib import pynutil class 
OrdinalFst(GraphFst): @@ -113,6 +112,6 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): + pynutil.insert("\"") + pynutil.insert(" morphosyntactic_features: \"er\"") ) - + final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py index 811382baa..2bbd6861f 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py @@ -25,7 +25,6 @@ delete_space, ) from nemo_text_processing.text_normalization.es.graph_utils import ES_PLUS -from pynini.lib import pynutil class TelephoneFst(GraphFst): @@ -138,7 +137,12 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): # optionally denormalize country codes optional_country_code = pynini.closure( - pynini.cross(plus, "+") + delete_space + (single_digits | group_of_two | group_of_three) + insert_separator, 0, 1 + pynini.cross(plus, "+") + + delete_space + + (single_digits | group_of_two | group_of_three) + + insert_separator, + 0, + 1, ) ext_phrase = pynini.accep(" extensión ") diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py index 6a5384639..50f8f40bb 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py @@ -28,7 +28,6 @@ insert_space, ) from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS, ES_PLUS -from pynini.lib import pynutil class TimeFst(GraphFst): @@ -112,8 +111,8 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): article |= pynini.accep("Las ").optimize() half |= pynini.accep("Media").optimize() quarter |= pynini.accep("Cuarto").optimize() - and_graph |= pynini.union("Y","Con").optimize() - + and_graph |= pynini.union("Y", "Con").optimize() + graph_1oclock = pynini.cross(oneoclock, "la 1") if input_case == INPUT_CASED: graph_1oclock |= pynini.cross(pynini.accep("la Una"), "la 1") @@ -156,10 +155,10 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): ) # las nueve a eme (only convert on-the-hour times if they are followed by a suffix) - graph_1oclock_with_suffix = pynini.closure(pynini.union("la ", "La "), 0, 1) + pynini.cross(pynini.union("una", "Unia"), "1") - graph_hour_with_suffix = pynini.closure(article, 0, 1) + graph_1_to_100 @ pynini.union( - *digits_2_to_23 + graph_1oclock_with_suffix = pynini.closure(pynini.union("la ", "La "), 0, 1) + pynini.cross( + pynini.union("una", "Unia"), "1" ) + graph_hour_with_suffix = pynini.closure(article, 0, 1) + graph_1_to_100 @ pynini.union(*digits_2_to_23) final_graph_hour_with_suffix = ( pynutil.insert("hours: \"") + (graph_1oclock_with_suffix | graph_hour_with_suffix) + pynutil.insert("\"") ) diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py index f16f94baa..a2d3fdf29 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py @@ -84,7 +84,9 @@ def __init__( fraction = FractionFst(cardinal, ordinal, input_case=input_case) fraction_graph = fraction.fst - measure_graph = MeasureFst(cardinal=cardinal, 
decimal=decimal, fraction=fraction, input_case=input_case).fst + measure_graph = MeasureFst( + cardinal=cardinal, decimal=decimal, fraction=fraction, input_case=input_case + ).fst date_graph = DateFst(cardinal, input_case=input_case).fst word_graph = WordFst().fst time_graph = TimeFst(input_case=input_case).fst diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py index d2d6421fd..55f10efc6 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pynini import os + +import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -22,7 +25,6 @@ convert_space, ) from nemo_text_processing.text_normalization.en.utils import load_labels -from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py index 397737380..61b9e0b84 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, @@ -21,7 +23,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class DateFst(GraphFst): @@ -42,13 +43,7 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") ) - month = ( - pynutil.delete("month:") - + delete_space - + pynutil.delete("\"") - + graph_month - + pynutil.delete("\"") - ) + month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + graph_month + pynutil.delete("\"") day = ( pynutil.delete("day:") + delete_space diff --git a/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py index 3b47d5c94..ebef11007 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py @@ -102,7 +102,9 @@ def __init__( fraction = FractionFst(cardinal, ordinal, input_case=input_case) fraction_graph = fraction.fst - measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, input_case=input_case).fst + measure_graph = MeasureFst( + cardinal=cardinal, decimal=decimal, fraction=fraction, input_case=input_case + ).fst date_graph = DateFst(cardinal, input_case=input_case).fst word_graph = WordFst().fst time_graph = TimeFst(input_case=input_case).fst diff --git a/nemo_text_processing/text_normalization/es/graph_utils.py b/nemo_text_processing/text_normalization/es/graph_utils.py index 02a2f8e19..6dbe08417 100644 --- a/nemo_text_processing/text_normalization/es/graph_utils.py +++ b/nemo_text_processing/text_normalization/es/graph_utils.py @@ -43,6 +43,7 @@ ES_MINUS = pynini.union("menos", "Menos", 
"MENOS").optimize() ES_PLUS = pynini.union("más", "Más", "MÁS").optimize() + def strip_accent(fst: 'pynini.FstLike') -> 'pynini.FstLike': """ Converts all accented vowels to non-accented equivalents diff --git a/tests/nemo_text_processing/es/test_cardinal.py b/tests/nemo_text_processing/es/test_cardinal.py index db0f1a6f9..e1b57fca3 100644 --- a/tests/nemo_text_processing/es/test_cardinal.py +++ b/tests/nemo_text_processing/es/test_cardinal.py @@ -39,7 +39,7 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected - + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_cardinal_cased.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit diff --git a/tests/nemo_text_processing/es/test_date.py b/tests/nemo_text_processing/es/test_date.py index 5256ea5d7..4b12236ee 100644 --- a/tests/nemo_text_processing/es/test_date.py +++ b/tests/nemo_text_processing/es/test_date.py @@ -37,7 +37,7 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected - + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_cardinal_cased.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit diff --git a/tests/nemo_text_processing/es/test_decimal.py b/tests/nemo_text_processing/es/test_decimal.py index 54cb930c9..7467e45b2 100644 --- a/tests/nemo_text_processing/es/test_decimal.py +++ b/tests/nemo_text_processing/es/test_decimal.py @@ -38,7 +38,6 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected - @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_decimal_cased.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit diff --git a/tests/nemo_text_processing/es/test_electronic.py b/tests/nemo_text_processing/es/test_electronic.py index 64929e882..ae0e4530c 100644 --- a/tests/nemo_text_processing/es/test_electronic.py +++ b/tests/nemo_text_processing/es/test_electronic.py @@ -38,7 +38,6 @@ def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected - @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_electronic_cased.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit @@ -46,8 +45,6 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected - - normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git a/tests/nemo_text_processing/es/test_fraction.py b/tests/nemo_text_processing/es/test_fraction.py index ec2b856b4..d0e818726 100644 --- a/tests/nemo_text_processing/es/test_fraction.py +++ b/tests/nemo_text_processing/es/test_fraction.py @@ -14,10 +14,11 @@ import pytest +from parameterized import parameterized + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file diff --git 
a/tests/nemo_text_processing/es/test_measure.py b/tests/nemo_text_processing/es/test_measure.py index e6b5fd941..572c88d03 100644 --- a/tests/nemo_text_processing/es/test_measure.py +++ b/tests/nemo_text_processing/es/test_measure.py @@ -39,7 +39,6 @@ def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected - @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_measure_cased.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit @@ -47,7 +46,6 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected - normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git a/tests/nemo_text_processing/es/test_money.py b/tests/nemo_text_processing/es/test_money.py index d4392e764..acc1fea82 100644 --- a/tests/nemo_text_processing/es/test_money.py +++ b/tests/nemo_text_processing/es/test_money.py @@ -35,11 +35,10 @@ class TestMoney: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected - + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected - @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_money_cased.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit @@ -47,7 +46,6 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected - normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git a/tests/nemo_text_processing/es/test_ordinal.py b/tests/nemo_text_processing/es/test_ordinal.py index d60e567e8..e2cd7d4a2 100644 --- a/tests/nemo_text_processing/es/test_ordinal.py +++ b/tests/nemo_text_processing/es/test_ordinal.py @@ -46,7 +46,6 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected - normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( NormalizerWithAudio(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) diff --git a/tests/nemo_text_processing/es/test_telephone.py b/tests/nemo_text_processing/es/test_telephone.py index c1f02115a..265f877f6 100644 --- a/tests/nemo_text_processing/es/test_telephone.py +++ b/tests/nemo_text_processing/es/test_telephone.py @@ -35,11 +35,10 @@ class TestTelephone: def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) assert pred == expected - + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected - @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_telephone_cased.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit @@ -47,7 +46,6 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected - normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git 
a/tests/nemo_text_processing/es/test_word.py b/tests/nemo_text_processing/es/test_word.py index 80a841421..11002ea99 100644 --- a/tests/nemo_text_processing/es/test_word.py +++ b/tests/nemo_text_processing/es/test_word.py @@ -34,11 +34,10 @@ class TestWord: def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) assert pred == expected - + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected - @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_word_cased.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit @@ -46,7 +45,6 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected - normalizer_es = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio_es = ( NormalizerWithAudio(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) From e36c8130c3da743df92e0b3eb2e0c11d64b49c19 Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Thu, 29 Feb 2024 07:29:13 -0800 Subject: [PATCH 04/18] style changes Signed-off-by: Mariana Graterol Fuenmayor --- .../es/graph_utils.py | 3 +- .../es/taggers/cardinal.py | 4 -- .../es/taggers/date.py | 2 - .../es/taggers/decimal.py | 20 ++++----- .../es/taggers/electronic.py | 44 +++++++------------ .../es/taggers/fraction.py | 2 - .../es/taggers/measure.py | 2 - .../es/taggers/money.py | 16 ++++--- .../es/taggers/ordinal.py | 4 +- .../es/taggers/punctuation.py | 3 +- .../es/taggers/telephone.py | 9 ++-- .../es/taggers/time.py | 12 +++-- .../es/taggers/tokenize_and_classify.py | 7 +-- .../es/taggers/whitelist.py | 3 +- .../es/taggers/word.py | 3 +- .../es/verbalizers/cardinal.py | 3 +- .../es/verbalizers/date.py | 8 +--- .../es/verbalizers/decimal.py | 3 +- .../es/verbalizers/electronic.py | 3 +- .../es/verbalizers/fraction.py | 3 +- .../es/verbalizers/measure.py | 3 +- .../es/verbalizers/money.py | 3 +- .../es/verbalizers/ordinal.py | 3 +- .../es/verbalizers/telephone.py | 3 +- .../es/verbalizers/time.py | 3 +- .../es/verbalizers/verbalize_final.py | 3 +- .../es/verbalizers/whitelist.py | 3 +- .../es/verbalizers/word.py | 3 +- .../es_en/graph_utils.py | 3 +- .../es_en/taggers/tokenize_and_classify.py | 7 +-- .../es_en/verbalizers/verbalize.py | 3 +- .../es_en/verbalizers/verbalize_final.py | 3 +- 32 files changed, 76 insertions(+), 118 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/es/graph_utils.py b/nemo_text_processing/inverse_text_normalization/es/graph_utils.py index 0396b5b4c..164ebcdbb 100644 --- a/nemo_text_processing/inverse_text_normalization/es/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/es/graph_utils.py @@ -14,9 +14,8 @@ import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.es.utils import get_abs_path +from pynini.lib import pynutil def int_to_roman(fst: 'pynini.FstLike') -> 'pynini.FstLike': diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py index 2769880d0..907b87c26 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py @@ -14,8 +14,6 @@ import pynini -from pynini.lib import pynutil - from 
nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -178,7 +176,6 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize() self.numbers_up_to_million = numbers_up_to_million.optimize() - if input_case == INPUT_CASED: graph |= capitalized_input_graph(graph) graph_digit |= capitalized_input_graph(graph_digit) @@ -201,7 +198,6 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() - def delete_word(self, word: str): """ Capitalizes word for `cased` input""" delete_graph = pynutil.delete(word).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py index 709a11f3d..8b749bd8f 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py @@ -13,8 +13,6 @@ # limitations under the License. import pynini -from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.graph_utils import int_to_roman from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py index 2a81da751..ef9aad837 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py @@ -13,8 +13,6 @@ # limitations under the License. import pynini -from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -34,7 +32,7 @@ def get_quantity( decimal: 'pynini.FstLike', cardinal_up_to_million: 'pynini.FstLike', input_case: str = INPUT_LOWER_CASED - ) -> 'pynini.FstLike': +) -> 'pynini.FstLike': """ Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, e.g. 
one million -> integer_part: "1" quantity: "million" @@ -115,7 +113,6 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): decimal_point |= pynini.cross("Coma", "morphosyntactic_features: \",\"") decimal_point |= pynini.cross("Punto", "morphosyntactic_features: \".\"") - optional_graph_negative = pynini.closure( pynutil.insert("negative: ") + pynini.cross(ES_MINUS, "\"true\"") + delete_extra_space, 0, 1 ) @@ -132,22 +129,21 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): ) final_graph = optional_graph_negative + final_graph_wo_sign - self.final_graph_wo_negative = final_graph_wo_sign | get_quantity( - final_graph_wo_sign, cardinal.numbers_up_to_million, input_case=input_case - ).optimize() - + self.final_graph_wo_negative = ( + final_graph_wo_sign + | get_quantity(final_graph_wo_sign, cardinal.numbers_up_to_million, input_case=input_case).optimize() + ) + # accept semiotic spans that start with a capital letter self.final_graph_wo_negative |= pynutil.add_weight( pynini.compose(TO_LOWER + NEMO_SIGMA, self.final_graph_wo_negative), MIN_NEG_WEIGHT ).optimize() - quantity_graph = get_quantity( - final_graph_wo_sign, cardinal.numbers_up_to_million, input_case=input_case - ) + quantity_graph = get_quantity(final_graph_wo_sign, cardinal.numbers_up_to_million, input_case=input_case) final_graph |= optional_graph_negative + quantity_graph if input_case == INPUT_CASED: final_graph |= capitalized_input_graph(final_graph) - + final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py index 0b4e63da4..c40d5454a 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py @@ -78,11 +78,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): server_names = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).invert() if input_case == INPUT_CASED: server_names = capitalized_input_graph(server_names) - server = ( - single_alphanum - | server_names - | pynini.closure(NEMO_ALPHA, 2) - ) + server = single_alphanum | server_names | pynini.closure(NEMO_ALPHA, 2) if input_case == INPUT_CASED: domain = [] @@ -102,48 +98,42 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): + domain + pynutil.insert("\"") ) - + at = pynini.accep("arroba") if input_case == INPUT_CASED: at |= pynini.accep("Arroba") - graph = ( - username + delete_extra_space + pynutil.delete(at) + insert_space + delete_extra_space + domain_graph - ) + graph = username + delete_extra_space + pynutil.delete(at) + insert_space + delete_extra_space + domain_graph ############# url ### if input_case == INPUT_CASED: spoken_ws = pynini.union( - "doble ve doble ve doble ve", - "Doble Ve Doble Ve Doble Ve", - "Doble ve doble ve doble ve" + "doble ve doble ve doble ve", "Doble Ve Doble Ve Doble Ve", "Doble ve doble ve doble ve" ) protocol_end = pynini.cross(pynini.union(*get_various_formats("www")) | spoken_ws, "www") - spoken_http = pynini.union( - "hache te te pe", - "Hache te te pe", - "Hache Te Te Pe" + spoken_http = pynini.union("hache te te pe", "Hache te te pe", "Hache Te Te Pe") + spoken_https = pynini.union("hache te te pe ese", "Hache te te pe ese", "Hache Te Te Pe Ese") + protocol_start = pynini.cross( + pynini.union(*get_various_formats("http")) | spoken_http, "http" + ) | 
pynini.cross(pynini.union(*get_various_formats("https")) | spoken_https, "https") + else: + protocol_end = pynutil.add_weight( + pynini.cross(pynini.union("www", "w w w", "doble ve doble ve doble ve"), "www"), MIN_POS_WEIGHT ) - spoken_https = pynini.union( - "hache te te pe ese", - "Hache te te pe ese", - "Hache Te Te Pe Ese" + protocol_start = pynutil.add_weight( + pynini.cross(pynini.union("http", "h t t p", "hache te te pe"), "http"), MIN_POS_WEIGHT ) - protocol_start = pynini.cross(pynini.union(*get_various_formats("http")) | spoken_http, "http") | pynini.cross( - pynini.union(*get_various_formats("https")) | spoken_https, "https" + protocol_start |= pynutil.add_weight( + pynini.cross(pynini.union("https", "h t t p s", "hache te te pe ese"), "https"), MIN_POS_WEIGHT ) - else: - protocol_end = pynutil.add_weight(pynini.cross(pynini.union("www", "w w w", "doble ve doble ve doble ve"), "www"), MIN_POS_WEIGHT) - protocol_start = pynutil.add_weight(pynini.cross(pynini.union("http", "h t t p", "hache te te pe"), "http"), MIN_POS_WEIGHT) - protocol_start |= pynutil.add_weight(pynini.cross(pynini.union("https", "h t t p s", "hache te te pe ese"), "https"), MIN_POS_WEIGHT) protocol_start += pynini.cross(" dos puntos barra barra ", "://") # e.g. .com, .es ending = ( delete_extra_space - + symbols + + symbols + delete_extra_space + (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username) ) diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py index d856bfc37..de1ea519f 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py @@ -14,8 +14,6 @@ import pynini -from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import INPUT_LOWER_CASED, NEMO_SIGMA, NEMO_SPACE, GraphFst from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py index f9091c352..16f0b0073 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py @@ -13,8 +13,6 @@ # limitations under the License. import pynini -from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/money.py b/nemo_text_processing/inverse_text_normalization/es/taggers/money.py index 9f1dd658a..dbd1f6d69 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/money.py @@ -13,8 +13,6 @@ # limitations under the License. 
import pynini -from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -28,6 +26,7 @@ delete_space, insert_space, ) +from pynini.lib import pynutil class MoneyFst(GraphFst): @@ -60,11 +59,15 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU if input_case == INPUT_CASED: unit_singular = capitalized_input_graph(unit_singular) - unit_singular_capitalized = pynini.string_file(get_abs_path("data/money/currency_major_singular_capitalized.tsv")) + unit_singular_capitalized = pynini.string_file( + get_abs_path("data/money/currency_major_singular_capitalized.tsv") + ) unit_singular |= pynini.invert(unit_singular_capitalized).optimize() - + unit_plural = capitalized_input_graph(unit_plural) - unit_plural_capitalized = pynini.string_file(get_abs_path("data/money/currency_major_plural_capitalized.tsv")) + unit_plural_capitalized = pynini.string_file( + get_abs_path("data/money/currency_major_plural_capitalized.tsv") + ) unit_plural |= pynini.invert(unit_plural_capitalized).optimize() unit_minor_singular = capitalized_input_graph(unit_minor_singular).optimize() @@ -92,7 +95,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU + insert_space + pynutil.insert("fractional_part: \"") + pynini.union( - pynutil.add_weight(((NEMO_SIGMA - one_graph) @ cardinal_graph), -0.7) @ add_leading_zero_to_double_digit + pynutil.add_weight(((NEMO_SIGMA - one_graph) @ cardinal_graph), -0.7) + @ add_leading_zero_to_double_digit + delete_space, pynini.cross(one_graph, "01") + delete_space, ) diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py index bd791e359..510d94527 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py @@ -13,8 +13,6 @@ # limitations under the License. import pynini -from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -113,6 +111,6 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): + pynutil.insert("\"") + pynutil.insert(" morphosyntactic_features: \"er\"") ) - + final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/es/taggers/punctuation.py index c34f732ed..12405d5c5 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/punctuation.py @@ -13,9 +13,8 @@ # limitations under the License. import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from pynini.lib import pynutil class PunctuationFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py index 811382baa..3944f3b0c 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py @@ -13,8 +13,6 @@ # limitations under the License. 
import pynini -from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -138,7 +136,12 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): # optionally denormalize country codes optional_country_code = pynini.closure( - pynini.cross(plus, "+") + delete_space + (single_digits | group_of_two | group_of_three) + insert_separator, 0, 1 + pynini.cross(plus, "+") + + delete_space + + (single_digits | group_of_two | group_of_three) + + insert_separator, + 0, + 1, ) ext_phrase = pynini.accep(" extensión ") diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py index 6a5384639..685b8c358 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py @@ -14,8 +14,6 @@ import pynini -from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -112,8 +110,8 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): article |= pynini.accep("Las ").optimize() half |= pynini.accep("Media").optimize() quarter |= pynini.accep("Cuarto").optimize() - and_graph |= pynini.union("Y","Con").optimize() - + and_graph |= pynini.union("Y", "Con").optimize() + graph_1oclock = pynini.cross(oneoclock, "la 1") if input_case == INPUT_CASED: graph_1oclock |= pynini.cross(pynini.accep("la Una"), "la 1") @@ -156,10 +154,10 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): ) # las nueve a eme (only convert on-the-hour times if they are followed by a suffix) - graph_1oclock_with_suffix = pynini.closure(pynini.union("la ", "La "), 0, 1) + pynini.cross(pynini.union("una", "Unia"), "1") - graph_hour_with_suffix = pynini.closure(article, 0, 1) + graph_1_to_100 @ pynini.union( - *digits_2_to_23 + graph_1oclock_with_suffix = pynini.closure(pynini.union("la ", "La "), 0, 1) + pynini.cross( + pynini.union("una", "Unia"), "1" ) + graph_hour_with_suffix = pynini.closure(article, 0, 1) + graph_1_to_100 @ pynini.union(*digits_2_to_23) final_graph_hour_with_suffix = ( pynutil.insert("hours: \"") + (graph_1oclock_with_suffix | graph_hour_with_suffix) + pynutil.insert("\"") ) diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py index f16f94baa..347538787 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py @@ -15,8 +15,6 @@ import os import pynini -from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.es.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.es.taggers.decimal import DecimalFst @@ -38,6 +36,7 @@ generator_main, ) from nemo_text_processing.utils.logging import logger +from pynini.lib import pynutil class ClassifyFst(GraphFst): @@ -84,7 +83,9 @@ def __init__( fraction = FractionFst(cardinal, ordinal, input_case=input_case) fraction_graph = fraction.fst - measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, 
input_case=input_case).fst + measure_graph = MeasureFst( + cardinal=cardinal, decimal=decimal, fraction=fraction, input_case=input_case + ).fst date_graph = DateFst(cardinal, input_case=input_case).fst word_graph = WordFst().fst time_graph = TimeFst(input_case=input_case).fst diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py index d2d6421fd..4aae8490a 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pynini import os + +import pynini from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/word.py b/nemo_text_processing/inverse_text_normalization/es/taggers/word.py index b09f941e2..57f143d0d 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/word.py @@ -13,9 +13,8 @@ # limitations under the License. import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst +from pynini.lib import pynutil class WordFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/cardinal.py index 46fca2a1c..e6737be6e 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/cardinal.py @@ -13,9 +13,8 @@ # limitations under the License. import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from pynini.lib import pynutil class CardinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py index 397737380..d320d0b21 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py @@ -42,13 +42,7 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") ) - month = ( - pynutil.delete("month:") - + delete_space - + pynutil.delete("\"") - + graph_month - + pynutil.delete("\"") - ) + month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + graph_month + pynutil.delete("\"") day = ( pynutil.delete("day:") + delete_space diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/decimal.py index c9b060ec1..b443733a2 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/decimal.py @@ -13,9 +13,8 @@ # limitations under the License. 
import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from pynini.lib import pynutil class DecimalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/electronic.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/electronic.py index 069716123..cf6bdc779 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/electronic.py @@ -13,9 +13,8 @@ # limitations under the License. import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from pynini.lib import pynutil class ElectronicFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/fraction.py index e553d95e9..eae72ab4b 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/fraction.py @@ -14,9 +14,8 @@ import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space +from pynini.lib import pynutil class FractionFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/measure.py index 6162f0c20..0bd8f7460 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/measure.py @@ -13,14 +13,13 @@ # limitations under the License. import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, GraphFst, delete_extra_space, delete_space, ) +from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/money.py index 60e9b7aeb..a52371f1e 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/money.py @@ -13,9 +13,8 @@ # limitations under the License. import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space +from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py index c1c9bdb46..8cfb15095 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py @@ -13,9 +13,8 @@ # limitations under the License. 
import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from pynini.lib import pynutil class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py index 58aa190ba..bc32f62fe 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py @@ -13,9 +13,8 @@ # limitations under the License. import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst +from pynini.lib import pynutil class TelephoneFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py index 1d878f2d1..ab7913b64 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py @@ -13,8 +13,6 @@ # limitations under the License. import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, @@ -22,6 +20,7 @@ delete_space, insert_space, ) +from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py index 6b22d6f73..26d09996b 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py @@ -13,11 +13,10 @@ # limitations under the License. import pynini -from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.es.verbalizers.word import WordFst from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space +from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py index 606a4e569..cc231a46b 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py @@ -14,9 +14,8 @@ import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space +from pynini.lib import pynutil class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py index 8c0bd08b1..3a5ba96b9 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py @@ -13,9 +13,8 @@ # limitations under the License. 
import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space +from pynini.lib import pynutil class WordFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es_en/graph_utils.py b/nemo_text_processing/inverse_text_normalization/es_en/graph_utils.py index 0396b5b4c..164ebcdbb 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/graph_utils.py @@ -14,9 +14,8 @@ import pynini -from pynini.lib import pynutil - from nemo_text_processing.text_normalization.es.utils import get_abs_path +from pynini.lib import pynutil def int_to_roman(fst: 'pynini.FstLike') -> 'pynini.FstLike': diff --git a/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py index 3b47d5c94..8597bb4f6 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py @@ -15,8 +15,6 @@ import os import pynini -from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst as EnCardinalFst from nemo_text_processing.inverse_text_normalization.en.taggers.date import DateFst as EnDateFst from nemo_text_processing.inverse_text_normalization.en.taggers.decimal import DecimalFst as EnDecimalFst @@ -51,6 +49,7 @@ generator_main, ) from nemo_text_processing.utils.logging import logger +from pynini.lib import pynutil class ClassifyFst(GraphFst): @@ -102,7 +101,9 @@ def __init__( fraction = FractionFst(cardinal, ordinal, input_case=input_case) fraction_graph = fraction.fst - measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, input_case=input_case).fst + measure_graph = MeasureFst( + cardinal=cardinal, decimal=decimal, fraction=fraction, input_case=input_case + ).fst date_graph = DateFst(cardinal, input_case=input_case).fst word_graph = WordFst().fst time_graph = TimeFst(input_case=input_case).fst diff --git a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py index 483d083c8..3eb7ba3a6 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.en.verbalizers.cardinal import CardinalFst as EnCardinalFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.date import DateFst as EnDateFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.decimal import DecimalFst as EnDecimalFst @@ -36,6 +34,7 @@ from nemo_text_processing.inverse_text_normalization.es.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.es.verbalizers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from pynini.lib import pynutil class VerbalizeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py index 3323f173b..65d9b91c4 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py @@ -13,11 +13,10 @@ # limitations under the License. import pynini -from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.es_en.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space +from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): From 16eeecc93e532a8e1c0b4652e18f9bf84ab6f5ef Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 Feb 2024 15:32:16 +0000 Subject: [PATCH 05/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/es/graph_utils.py | 3 ++- .../inverse_text_normalization/es/taggers/money.py | 3 ++- .../inverse_text_normalization/es/taggers/punctuation.py | 3 ++- .../es/taggers/tokenize_and_classify.py | 3 ++- .../inverse_text_normalization/es/taggers/word.py | 3 ++- .../inverse_text_normalization/es/verbalizers/cardinal.py | 3 ++- .../inverse_text_normalization/es/verbalizers/decimal.py | 3 ++- .../inverse_text_normalization/es/verbalizers/electronic.py | 3 ++- .../inverse_text_normalization/es/verbalizers/fraction.py | 3 ++- .../inverse_text_normalization/es/verbalizers/measure.py | 3 ++- .../inverse_text_normalization/es/verbalizers/money.py | 3 ++- .../inverse_text_normalization/es/verbalizers/ordinal.py | 3 ++- .../inverse_text_normalization/es/verbalizers/telephone.py | 3 ++- .../inverse_text_normalization/es/verbalizers/time.py | 3 ++- .../es/verbalizers/verbalize_final.py | 3 ++- .../inverse_text_normalization/es/verbalizers/whitelist.py | 3 ++- .../inverse_text_normalization/es/verbalizers/word.py | 3 ++- .../inverse_text_normalization/es_en/graph_utils.py | 3 ++- .../es_en/taggers/tokenize_and_classify.py | 3 ++- .../inverse_text_normalization/es_en/verbalizers/verbalize.py | 3 ++- .../es_en/verbalizers/verbalize_final.py | 3 ++- 21 files changed, 42 insertions(+), 21 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/es/graph_utils.py b/nemo_text_processing/inverse_text_normalization/es/graph_utils.py index 164ebcdbb..0396b5b4c 100644 --- a/nemo_text_processing/inverse_text_normalization/es/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/es/graph_utils.py @@ -14,9 
+14,10 @@ import pynini -from nemo_text_processing.text_normalization.es.utils import get_abs_path from pynini.lib import pynutil +from nemo_text_processing.text_normalization.es.utils import get_abs_path + def int_to_roman(fst: 'pynini.FstLike') -> 'pynini.FstLike': """ diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/money.py b/nemo_text_processing/inverse_text_normalization/es/taggers/money.py index dbd1f6d69..a65d9bcd2 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/money.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, @@ -26,7 +28,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class MoneyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/es/taggers/punctuation.py index 12405d5c5..c34f732ed 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/punctuation.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + class PunctuationFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py index 347538787..a2d3fdf29 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.es.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.es.taggers.decimal import DecimalFst @@ -36,7 +38,6 @@ generator_main, ) from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/word.py b/nemo_text_processing/inverse_text_normalization/es/taggers/word.py index 57f143d0d..b09f941e2 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/word.py @@ -13,9 +13,10 @@ # limitations under the License. 
import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/cardinal.py index e6737be6e..46fca2a1c 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/cardinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class CardinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/decimal.py index b443733a2..c9b060ec1 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/decimal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class DecimalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/electronic.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/electronic.py index cf6bdc779..069716123 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/electronic.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class ElectronicFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/fraction.py index eae72ab4b..e553d95e9 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/fraction.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space + class FractionFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/measure.py index 0bd8f7460..6162f0c20 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/measure.py @@ -13,13 +13,14 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, GraphFst, delete_extra_space, delete_space, ) -from pynini.lib import pynutil class MeasureFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/money.py index a52371f1e..60e9b7aeb 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/money.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space + class MoneyFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py index 8cfb15095..c1c9bdb46 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + class OrdinalFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py index bc32f62fe..58aa190ba 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/telephone.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst + class TelephoneFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py index ab7913b64..1d878f2d1 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py @@ -13,6 +13,8 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, @@ -20,7 +22,6 @@ delete_space, insert_space, ) -from pynini.lib import pynutil class TimeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py index 26d09996b..6b22d6f73 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py @@ -13,10 +13,11 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.es.verbalizers.word import WordFst from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py index cc231a46b..606a4e569 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + class WhiteListFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py index 3a5ba96b9..8c0bd08b1 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py @@ -13,9 +13,10 @@ # limitations under the License. import pynini -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + class WordFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/es_en/graph_utils.py b/nemo_text_processing/inverse_text_normalization/es_en/graph_utils.py index 164ebcdbb..0396b5b4c 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/graph_utils.py @@ -14,9 +14,10 @@ import pynini -from nemo_text_processing.text_normalization.es.utils import get_abs_path from pynini.lib import pynutil +from nemo_text_processing.text_normalization.es.utils import get_abs_path + def int_to_roman(fst: 'pynini.FstLike') -> 'pynini.FstLike': """ diff --git a/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py index 8597bb4f6..ebef11007 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py @@ -15,6 +15,8 @@ import os import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst as EnCardinalFst from nemo_text_processing.inverse_text_normalization.en.taggers.date import DateFst as EnDateFst from nemo_text_processing.inverse_text_normalization.en.taggers.decimal import DecimalFst as EnDecimalFst @@ -49,7 +51,6 @@ generator_main, ) from nemo_text_processing.utils.logging import logger -from pynini.lib import pynutil class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py index 
3eb7ba3a6..483d083c8 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.en.verbalizers.cardinal import CardinalFst as EnCardinalFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.date import DateFst as EnDateFst from nemo_text_processing.inverse_text_normalization.en.verbalizers.decimal import DecimalFst as EnDecimalFst @@ -34,7 +36,6 @@ from nemo_text_processing.inverse_text_normalization.es.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.es.verbalizers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.en.graph_utils import GraphFst -from pynini.lib import pynutil class VerbalizeFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py index 65d9b91c4..3323f173b 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize_final.py @@ -13,10 +13,11 @@ # limitations under the License. import pynini +from pynini.lib import pynutil + from nemo_text_processing.inverse_text_normalization.es.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.es_en.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space -from pynini.lib import pynutil class VerbalizeFinalFst(GraphFst): From 1ea5fd5ac7e12cf51400f7406fa36c0249c3d69d Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Tue, 26 Mar 2024 08:35:58 -0700 Subject: [PATCH 06/18] fix imports Signed-off-by: Mariana Graterol Fuenmayor --- .../inverse_text_normalization/es/taggers/cardinal.py | 1 + .../inverse_text_normalization/es/taggers/date.py | 1 + .../inverse_text_normalization/es/taggers/decimal.py | 1 + .../inverse_text_normalization/es/taggers/fraction.py | 1 + .../inverse_text_normalization/es/taggers/measure.py | 1 + .../inverse_text_normalization/es/taggers/ordinal.py | 1 + .../inverse_text_normalization/es/taggers/telephone.py | 1 + .../inverse_text_normalization/es/taggers/time.py | 1 + 8 files changed, 8 insertions(+) diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py index 50640f54f..2f62d589d 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py @@ -14,6 +14,7 @@ import pynini +from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py index 9b7c1a182..af96ee002 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py @@ -13,6 +13,7 @@ # limitations under 
the License. import pynini +from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.es.graph_utils import int_to_roman from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py index 3f53a3e43..2b1949041 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py @@ -13,6 +13,7 @@ # limitations under the License. import pynini +from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py index c372f3999..a2b55026e 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py @@ -14,6 +14,7 @@ import pynini +from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import INPUT_LOWER_CASED, NEMO_SIGMA, NEMO_SPACE, GraphFst from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py index 89989a804..9d231bc25 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py @@ -13,6 +13,7 @@ # limitations under the License. import pynini +from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py index 97c271d0b..2888b0056 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py @@ -13,6 +13,7 @@ # limitations under the License. import pynini +from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py index 7fad41513..52aa0ecf5 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py @@ -13,6 +13,7 @@ # limitations under the License. 
import pynini +from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py index a8525edb6..f35cadec8 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py @@ -14,6 +14,7 @@ import pynini +from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( INPUT_CASED, From fa0a97003a9b6fc8063e72720c3f969b3959d66a Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Tue, 25 Jun 2024 14:51:55 -0700 Subject: [PATCH 07/18] mod eval scripts Signed-off-by: Mariana Graterol Fuenmayor --- .../run_evaluate.py | 21 ++++++++++++++----- .../text_normalization/data_loader_utils.py | 15 ++++++------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 29fc935cf..393dc5f05 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -33,8 +33,14 @@ def parse_args(): parser = ArgumentParser() parser.add_argument("--input", help="input file path", type=str) parser.add_argument( - "--lang", help="language", choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi', 'hy'], default="en", type=str + "--lang", + help="language", + choices=["ar", "de", "en", "es", "es_en", "fr", "hy", "mr", "pt", "ru", "sv", "vi", "zh"], + default="en", + type=str, ) + parser.add_argument("--input_case", choices=["lower_cased", "cased"]) + parser.add_argument("--output_case", choices=["lower_cased", "cased"]) parser.add_argument( "--cat", dest="category", @@ -54,10 +60,15 @@ def parse_args(): if args.lang == 'en': from nemo_text_processing.inverse_text_normalization.en.clean_eval_data import filter_loaded_data file_path = args.input - inverse_normalizer = InverseNormalizer(lang=args.lang) + inverse_normalizer = InverseNormalizer(lang=args.lang, input_case=args.input_case) print("Loading training data: " + file_path) - training_data = load_files([file_path]) + if args.output_case == "lower_cased": + to_lower = True + elif args.output_case == "cased": + to_lower = False + + training_data = load_files([file_path], to_lower=to_lower) if args.filter: training_data = filter_loaded_data(training_data) @@ -109,4 +120,4 @@ def parse_args(): print(f'{str(c1[i]):10s} | {str(c2[i]):10s} | {str(c3[i]):5s}') else: print(f'numbers\t{token_count_per_type[args.category]}') - print(f'Denormalization\t{token_accuracy[args.category]}') + print(f'Denormalization\t{token_accuracy[args.category]}') \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py index b13851313..f184dc7a5 100644 --- 
a/nemo_text_processing/text_normalization/data_loader_utils.py +++ b/nemo_text_processing/text_normalization/data_loader_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -46,7 +46,7 @@ ] -def _load_kaggle_text_norm_file(file_path: str) -> List[Instance]: +def _load_kaggle_text_norm_file(file_path: str, to_lower: bool) -> List[Instance]: """ https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish Loads text file in the Kaggle Google text normalization file format: \t\t<`self` if trivial class or normalized text> @@ -76,8 +76,9 @@ def _load_kaggle_text_norm_file(file_path: str) -> List[Instance]: res.append(Instance(token_type=EOS_TYPE, un_normalized="", normalized="")) else: l_type, l_token, l_normalized = parts - l_token = l_token.lower() - l_normalized = l_normalized.lower() + if to_lower: + l_token = l_token.lower() + l_normalized = l_normalized.lower() if l_type == PLAIN_TYPE: res.append(Instance(token_type=l_type, un_normalized=l_token, normalized=l_token)) @@ -86,7 +87,7 @@ def _load_kaggle_text_norm_file(file_path: str) -> List[Instance]: return res -def load_files(file_paths: List[str], load_func=_load_kaggle_text_norm_file) -> List[Instance]: +def load_files(file_paths: List[str], load_func=_load_kaggle_text_norm_file, to_lower: bool = True) -> List[Instance]: """ Load given list of text files using the `load_func` function. @@ -98,7 +99,7 @@ def load_files(file_paths: List[str], load_func=_load_kaggle_text_norm_file) -> """ res = [] for file_path in file_paths: - res.extend(load_func(file_path=file_path)) + res.extend(load_func(file_path=file_path, to_lower=to_lower)) return res @@ -348,4 +349,4 @@ def _is_valid(idx_out, idx_in, normalized_text, input): logger.info(f"Skipping post-processing of {''.join(normalized_text)} for '{punct}'") normalized_text = "".join(normalized_text) - return re.sub(r' +', ' ', normalized_text) + return re.sub(r' +', ' ', normalized_text) \ No newline at end of file From f9f69454902dd8c5b3d07f38f0beee7d2550e723 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 25 Jun 2024 21:52:13 +0000 Subject: [PATCH 08/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo_text_processing/inverse_text_normalization/run_evaluate.py | 2 +- nemo_text_processing/text_normalization/data_loader_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 393dc5f05..c59852942 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -120,4 +120,4 @@ def parse_args(): print(f'{str(c1[i]):10s} | {str(c2[i]):10s} | {str(c3[i]):5s}') else: print(f'numbers\t{token_count_per_type[args.category]}') - print(f'Denormalization\t{token_accuracy[args.category]}') \ No newline at end of file + print(f'Denormalization\t{token_accuracy[args.category]}') diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py index f184dc7a5..01a85ec10 100644 --- 
a/nemo_text_processing/text_normalization/data_loader_utils.py +++ b/nemo_text_processing/text_normalization/data_loader_utils.py @@ -349,4 +349,4 @@ def _is_valid(idx_out, idx_in, normalized_text, input): logger.info(f"Skipping post-processing of {''.join(normalized_text)} for '{punct}'") normalized_text = "".join(normalized_text) - return re.sub(r' +', ' ', normalized_text) \ No newline at end of file + return re.sub(r' +', ' ', normalized_text) From a85e03930ba83b225c4f73c4e9ecd76838593e20 Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Tue, 25 Jun 2024 15:16:50 -0700 Subject: [PATCH 09/18] update whitelist Signed-off-by: Mariana Graterol Fuenmayor --- .../es/data/whitelist.tsv | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv index 8d81f2c09..3973c1a0c 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv @@ -12,5 +12,36 @@ Prof. profesor Profa. profesora Sr. señor Sra. señora +Sres. señores Srta. señorita etc. etcétera +TXT t x t +TXT T x t +TXT T X T +GPS g p s +GPS G p s +GPS G P S +DNI d n i +DNI D n i +DNI D N I +ISSN I S S N +ISSN I s s n +ISSN i s s n +SMS S M S +SMS S m s +SMS s m s +ISBN I S B N +ISBN I s b n +ISBN i s b n +URL U R L +URL U r l +URL u r l +RPG R P G +RPG R p g +RPG r p g +ADN A D N +ADN A d n +ADN a d n +JPG J P G +JPG J p g +JPG j p g \ No newline at end of file From 2dbaf5628c6ed08b8bb00dd01653406f6ad79e31 Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Tue, 25 Jun 2024 16:56:17 -0700 Subject: [PATCH 10/18] update currencies for es itn Signed-off-by: Mariana Graterol Fuenmayor --- .../es/data/money/currency_major_plural.tsv | 1 + .../es/data/money/currency_major_plural_capitalized.tsv | 3 +++ .../es/data/money/currency_major_singular.tsv | 1 + .../es/data/money/currency_major_singular_capitalized.tsv | 3 +++ 4 files changed, 8 insertions(+) diff --git a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural.tsv b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural.tsv index 43353c7ee..5d6f9a530 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural.tsv @@ -1,6 +1,7 @@ € euros US$ dólares estadounidenses US$ dólares americanos +CAD$ dólares canadienses $ dólares $ pesos ¥ yenes diff --git a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural_capitalized.tsv index 129f641e5..c65809d3d 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural_capitalized.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural_capitalized.tsv @@ -4,6 +4,9 @@ US$ Dólares estadounidenses US$ Dólares Americanos US$ dólares Americanos US$ Dólares americanos +CAD$ dólares Canadienses +CAD$ Dólares canadienses +CAD$ Dólares Canadienses AR$ Pesos Argentinos AR$ pesos Argentinos AR$ Pesos argentinos diff --git a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular.tsv b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular.tsv 
index 879c8f1fe..3532bb2dd 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular.tsv @@ -1,6 +1,7 @@ € euro US$ dólar estadounidense US$ dólar americano +CAD$ dólar canadiense $ dólar $ peso ¥ yen diff --git a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular_capitalized.tsv index 995741a2f..7e8ba2611 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular_capitalized.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular_capitalized.tsv @@ -5,6 +5,9 @@ US$ dólar Estadounidense US$ dólar Americano US$ Dólar Americano US$ Dólar americano +CAD$ dólar Canadiense +CAD$ Dólar canadiense +CAD$ Dólar Canadiense AR$ peso Argentino AR$ Peso Argentino AR$ Peso argentino From e95cb4f69879b4aa7b4556416102fb9a3e827762 Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Wed, 3 Jul 2024 13:04:25 -0700 Subject: [PATCH 11/18] bugfix for es time grammar Signed-off-by: Mariana Graterol Fuenmayor --- Jenkinsfile | 2 +- .../inverse_text_normalization/es/taggers/time.py | 6 ++++++ .../es/taggers/tokenize_and_classify.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index d1b4062e4..7df3dac2b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -15,7 +15,7 @@ pipeline { DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-18-23-0' ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0' - ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2' + ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-03-23-0' FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/12-05-23-0' HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py index f35cadec8..e97c01c8a 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py @@ -101,6 +101,8 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): half = pynini.accep("media") quarter = pynini.accep("cuarto") and_graph = pynini.union("y", "con") + hours_word_graph = pynini.accep(" horas") + minutes_word_graph = pynini.union(" minuto", " minutos") if input_case == INPUT_CASED: suffix_graph |= pynini.string_file(get_abs_path("data/time/time_suffix_cased.tsv")).optimize() @@ -111,6 +113,8 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): half |= pynini.accep("Media").optimize() quarter |= pynini.accep("Cuarto").optimize() and_graph |= pynini.union("Y", "Con").optimize() + hours_word_graph |= pynini.accep(" Horas").optimize() + minutes_word_graph |= pynini.union(" Minuto", " Minutos").optimize() graph_1oclock = pynini.cross(oneoclock, "la 1") if input_case == INPUT_CASED: @@ -128,6 +132,8 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): pynutil.insert("minutes: \"") + pynini.closure(pynutil.delete(and_graph) + delete_space, 0, 1) + (graph_minute | graph_minute_verbose) + + 
From a00b6a82faaa81188de5e7688bf789a217a00781 Mon Sep 17 00:00:00 2001
From: Mariana Graterol Fuenmayor
Date: Wed, 3 Jul 2024 13:32:34 -0700
Subject: [PATCH 12/18] update cache for es

Signed-off-by: Mariana Graterol Fuenmayor
---
 Jenkinsfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 7df3dac2b..ca08116c5 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -14,7 +14,7 @@ pipeline {
     AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-23-23-0'
     DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
     EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-18-23-0'
-    ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0'
+    ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-03-23-0'
     ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-03-23-0'
     FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/12-05-23-0'
     HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
     PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'

From 52b14936ef1ef1c8b8dbe8837b9ea807178c24e6 Mon Sep 17 00:00:00 2001
From: Mariana Graterol Fuenmayor
Date: Tue, 23 Jul 2024 08:24:29 -0700
Subject: [PATCH 13/18] ordinal tagger fix

Signed-off-by: Mariana Graterol Fuenmayor
---
 .../inverse_text_normalization/es/taggers/ordinal.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py
index 2888b0056..d03640742 100644
--- a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py
+++ b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py
@@ -88,7 +88,6 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):
         graph_er_suffix = (pynini.project(graph_er_suffix, "input") - graph_exception.arcsort()) @ graph_er_suffix
         if input_case == INPUT_CASED:
-            graph_exception = capitalized_input_graph(graph_exception)
             graph_o_suffix = capitalized_input_graph(graph_o_suffix)
             graph_a_suffix = capitalized_input_graph(graph_a_suffix)
             graph_er_suffix = capitalized_input_graph(graph_er_suffix)

From d67fff617dec28aba42f0a7a60839326c9ee844e Mon Sep 17 00:00:00 2001
From: Mariana Graterol Fuenmayor
Date: Tue, 23 Jul 2024 08:25:47 -0700
Subject: [PATCH 14/18] telephone tagger fix

Signed-off-by: Mariana Graterol Fuenmayor
---
 .../inverse_text_normalization/es/taggers/telephone.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py
index 52aa0ecf5..1c0be2037 100644
--- a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py
+++ b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py
@@ -18,7 +18,6 @@
 from nemo_text_processing.text_normalization.en.graph_utils import (
     INPUT_CASED,
     INPUT_LOWER_CASED,
-    MIN_NEG_WEIGHT,
     GraphFst,
     capitalized_input_graph,
     delete_space,
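Illustrative example (not part of the patch series): the ordinal fix above stops re-wrapping graph_exception with the cased helper, since the exception set has already been subtracted from the suffix graphs by that point and only the suffix graphs themselves need the cased variants. A rough sketch of how capitalized_input_graph is used in these taggers, assuming the import path shown in the diffs; the toy mapping and expected output are hypothetical, and the exact cased spellings the helper admits depend on its implementation.

    # Toy lower-cased mapping extended to also accept a leading capital; illustrative only.
    import pynini
    from pynini.lib import rewrite
    from nemo_text_processing.text_normalization.en.graph_utils import capitalized_input_graph

    base = pynini.cross("cuarto", "4.º")   # hypothetical spoken-to-written ordinal pair
    cased = capitalized_input_graph(base)  # should additionally accept "Cuarto"

    print(rewrite.top_rewrite("Cuarto", cased))  # expected: 4.º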
From f627075cc9a852b43e188978c118d995b5fafe3c Mon Sep 17 00:00:00 2001
From: Mariana Graterol Fuenmayor
Date: Tue, 23 Jul 2024 08:30:09 -0700
Subject: [PATCH 15/18] time tagger fix

Signed-off-by: Mariana Graterol Fuenmayor
---
 .../es/taggers/time.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py
index e97c01c8a..eb885c15f 100644
--- a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py
+++ b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py
@@ -96,7 +96,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
         digits_2_to_23 = [str(digits) for digits in range(2, 24)]
         digits_1_to_59 = [str(digits) for digits in range(1, 60)]
-        oneoclock = pynini.accep("la una")
+        one_o_clock = pynini.accep("la una")
         article = pynini.accep("las ")
         half = pynini.accep("media")
         quarter = pynini.accep("cuarto")
@@ -116,17 +116,17 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
             hours_word_graph |= pynini.accep(" Horas").optimize()
             minutes_word_graph |= pynini.union(" Minuto", " Minutos").optimize()
-        graph_1oclock = pynini.cross(oneoclock, "la 1")
+        graph_one_o_clock = pynini.cross(one_o_clock, "la 1")
         if input_case == INPUT_CASED:
-            graph_1oclock |= pynini.cross(pynini.accep("la Una"), "la 1")
-            oneoclock_capitalized = pynini.union("La Una", "La una")
-            graph_1oclock |= pynini.cross(oneoclock_capitalized, "La 1").optimize()
+            graph_one_o_clock |= pynini.cross(pynini.accep("la Una"), "la 1")
+            one_o_clock_capitalized = pynini.union("La Una", "La una")
+            graph_one_o_clock |= pynini.cross(one_o_clock_capitalized, "La 1").optimize()
         graph_hour = article + graph_1_to_100 @ pynini.union(*digits_2_to_23)
         graph_minute = graph_1_to_100 @ pynini.union(*digits_1_to_59)
         graph_minute_verbose = pynini.cross(half, "30") | pynini.cross(quarter, "15")
-        final_graph_hour = pynutil.insert("hours: \"") + (graph_1oclock | graph_hour) + pynutil.insert("\"")
+        final_graph_hour = pynutil.insert("hours: \"") + (graph_one_o_clock | graph_hour) + pynutil.insert("\"")
         final_graph_minute = (
             pynutil.insert("minutes: \"")
@@ -160,12 +160,12 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
         )
         # las nueve a eme (only convert on-the-hour times if they are followed by a suffix)
-        graph_1oclock_with_suffix = pynini.closure(pynini.union("la ", "La "), 0, 1) + pynini.cross(
-            pynini.union("una", "Unia"), "1"
+        graph_one_o_clock_with_suffix = pynini.closure(pynini.union("la ", "La "), 0, 1) + pynini.cross(
+            pynini.union("una", "Una"), "1"
         )
         graph_hour_with_suffix = pynini.closure(article, 0, 1) + graph_1_to_100 @ pynini.union(*digits_2_to_23)
         final_graph_hour_with_suffix = (
-            pynutil.insert("hours: \"") + (graph_1oclock_with_suffix | graph_hour_with_suffix) + pynutil.insert("\"")
+            pynutil.insert("hours: \"") + (graph_one_o_clock_with_suffix | graph_hour_with_suffix) + pynutil.insert("\"")
         )
         graph_hsuffix = (
From b7a09c8f043bd3d7a841169d87efd3f92ca9824c Mon Sep 17 00:00:00 2001
From: Mariana Graterol Fuenmayor
Date: Tue, 23 Jul 2024 08:35:15 -0700
Subject: [PATCH 16/18] time verbalizer fix

Signed-off-by: Mariana Graterol Fuenmayor
---
 .../inverse_text_normalization/es/verbalizers/time.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py
index 1d878f2d1..44892b0be 100644
--- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py
+++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py
@@ -35,14 +35,14 @@ def __init__(self):
         super().__init__(name="time", kind="verbalize")
         add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)
-        # hour may or may not include preposition ("la" or "las")
-        preposition = pynini.union("la ", "las ", "La ", "Las ")
+        # hour may or may not include article ("la" or "las")
+        article = pynini.union("la ", "las ", "La ", "Las ")
         hour = (
             pynutil.delete("hours:")
             + delete_space
             + pynutil.delete("\"")
-            + pynini.closure(preposition, 0, 1)
+            + pynini.closure(article, 0, 1)
             + pynini.closure(NEMO_DIGIT, 1)
             + pynutil.delete("\"")
         )

From e232c43c5e672ac76191ce5522ae03a139990fbe Mon Sep 17 00:00:00 2001
From: Mariana Graterol Fuenmayor
Date: Tue, 23 Jul 2024 11:48:14 -0700
Subject: [PATCH 17/18] update cache

Signed-off-by: Mariana Graterol Fuenmayor
---
 Jenkinsfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 4c7aaefd7..425105a18 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -14,8 +14,8 @@ pipeline {
     AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0'
     DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0'
     EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-06-24-0'
-    ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-03-34-0'
-    ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-03-34-0'
+    ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-23-24-0'
+    ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-23-24-0'
     FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
     HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
     PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'

From 86ffe6cab0d5a708f3c3e6f223e798fcaa6dc1dd Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 23 Jul 2024 18:51:17 +0000
Subject: [PATCH 18/18] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../inverse_text_normalization/es/taggers/time.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py
index eb885c15f..9d55f35a3 100644
--- a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py
+++ b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py
@@ -165,7 +165,9 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
         )
         graph_hour_with_suffix = pynini.closure(article, 0, 1) + graph_1_to_100 @ pynini.union(*digits_2_to_23)
         final_graph_hour_with_suffix = (
-            pynutil.insert("hours: \"") + (graph_one_o_clock_with_suffix | graph_hour_with_suffix) + pynutil.insert("\"")
+            pynutil.insert("hours: \"")
+            + (graph_one_o_clock_with_suffix | graph_hour_with_suffix)
+            + pynutil.insert("\"")
         )
         graph_hsuffix = (