diff --git a/Jenkinsfile b/Jenkinsfile index bdfc55d21..4af74311e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -14,8 +14,8 @@ pipeline { AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0' DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0' EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-02-24-0' - ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0' - ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2' + ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-23-24-0' + ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-23-24-0' FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0' HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' diff --git a/nemo_text_processing/inverse_text_normalization/es/data/dates/months_cased.tsv b/nemo_text_processing/inverse_text_normalization/es/data/dates/months_cased.tsv new file mode 100644 index 000000000..137183097 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/dates/months_cased.tsv @@ -0,0 +1,12 @@ +Enero +Febrero +Marzo +Abril +Mayo +Junio +Julio +Agosto +Septiembre +Octubre +Noviembre +Diciembre \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/dates/year_suffix_cased.tsv b/nemo_text_processing/inverse_text_normalization/es/data/dates/year_suffix_cased.tsv new file mode 100644 index 000000000..221fd3605 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/dates/year_suffix_cased.tsv @@ -0,0 +1,11 @@ +A. N. E. antes de nuestra era +A. E. C. antes de la era común +A. C. antes de Cristo +A. J. C. antes de Jesucristo +A. P. antes del presente +N. E. nuestra era +E. C. era común +D. C. después de Cristo +D. D. J. C. después de Jesucristo +B. C. B C +A. D. 
a d diff --git a/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_plural.tsv b/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_plural.tsv index e0f4284cc..1986f5cdc 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_plural.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_plural.tsv @@ -23,6 +23,12 @@ gsm g s m gsm ge ese eme psi p s i psi pe ese i -° c grados centígrados -° f grados farenheit -° k grados kelvin +° C grados centígrados +° F grados farenheit +° K grados kelvin +mb megabits +MB megabytes +gb gigabits +GB gigabytes +TB terabytes +PB petabytes \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_singular.tsv b/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_singular.tsv index 22163c72f..13f977c83 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_singular.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/measures/measurements_singular.tsv @@ -17,9 +17,9 @@ min minuto % por ciento % porciento s segundo -° c grado centígrado -° f grado farenheit -° k grado kelvin +° C grado centígrado +° F grado farenheit +° K grado kelvin mph milla por hora kph kilómetro por hora gsm gramo por metro cuadrado @@ -27,3 +27,9 @@ gsm g s m gsm ge ese eme psi p s i psi pe ese i +mb megabit +MB megabyte +gb gigabit +GB gigabyte +TB terabyte +PB petabyte \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural.tsv b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural.tsv index 43353c7ee..5d6f9a530 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural.tsv @@ 
-1,6 +1,7 @@ € euros US$ dólares estadounidenses US$ dólares americanos +CAD$ dólares canadienses $ dólares $ pesos ¥ yenes diff --git a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural_capitalized.tsv new file mode 100644 index 000000000..c65809d3d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_plural_capitalized.tsv @@ -0,0 +1,75 @@ +US$ Dólares Estadounidenses +US$ dólares Estadounidenses +US$ Dólares estadounidenses +US$ Dólares Americanos +US$ dólares Americanos +US$ Dólares americanos +CAD$ dólares Canadienses +CAD$ Dólares canadienses +CAD$ Dólares Canadienses +AR$ Pesos Argentinos +AR$ pesos Argentinos +AR$ Pesos argentinos +BRL Reales Brasileños +BRL reales Brasileños +BRL Reales brasileños +CHF Francos Suizos +CHF francos Suizos +CHF Francos suizos +CLP Pesos Chilenos +CLP pesos Chilenos +CLP Pesos chilenos +CNY Yuan Chinos +CNY yuan Chinos +CNY Yuan chinos +COP Pesos Colombianos +COP pesos Colombianos +COP Pesos colombianos +CRC Colones Costarricenses +CRC colones Costarricenses +CRC Colones costarricenses +CUP Pesos Cubanos +CUP pesos Cubanos +CUP Pesos cubanos +RD$ Pesos Dominicanos +RD$ pesos Dominicanos +RD$ Pesos dominicanos +GBP Libras Esterlinas +GBP libras Esterlinas +GBP Libras esterlinas +HKD Dólares De Hong Kong +HKD dólares de Hong Kong +HKD Dólares de hong kong +INR Rupias Indias +INR rupias Indias +INR Rupias indias +Mex$ Pesos Mexicanos +Mex$ pesos Mexicanos +Mex$ Pesos mexicanos +SVC Colones Salvadoreños +SVC colones Salvadoreños +SVC Colones salvadoreños +UYU Pesos Uruguayos +UYU pesos Uruguayos +UYU Pesos uruguayos +VES Bolívares Soberanos +VES bolívares Soberanos +VES Bolívares soberanos +BOP Pesos Bolivianos +BOP pesos Bolivianos +BOP Pesos bolivianos +CLE Escudos Chilenos +CLE escudos Chilenos +CLE Escudos chilenos +ECS Sucres Ecuatorianos +ECS 
sucres Ecuatorianos +ECS Sucres ecuatorianos +PEH Soles De Oro +PEH soles de Oro +PEH Soles de oro +VEB Bolívares Venezolanos +VEB bolívares Venezolanos +VEB Bolívares venezolanos +VEF Bolívares Fuertes +VEF bolívares Fuertes +VEF Bolívares fuertes \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular.tsv b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular.tsv index 879c8f1fe..3532bb2dd 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular.tsv @@ -1,6 +1,7 @@ € euro US$ dólar estadounidense US$ dólar americano +CAD$ dólar canadiense $ dólar $ peso ¥ yen diff --git a/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular_capitalized.tsv new file mode 100644 index 000000000..7e8ba2611 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/money/currency_major_singular_capitalized.tsv @@ -0,0 +1,76 @@ +US$ dólar Estadounidense +US$ Dólar Estadounidense +US$ Dólar estadounidense +US$ dólar Estadounidense +US$ dólar Americano +US$ Dólar Americano +US$ Dólar americano +CAD$ dólar Canadiense +CAD$ Dólar canadiense +CAD$ Dólar Canadiense +AR$ peso Argentino +AR$ Peso Argentino +AR$ Peso argentino +BRL real Brasileño +BRL Real Brasileño +BRL Real brasileño +CHF franco Suizo +CHF Franco Suizo +CHF Franco suizo +CLP Peso Chileno +CLP peso Chileno +CLP Peso chileno +CNY Yuan Chino +CNY yuan Chino +CNY Yuan chino +COP Peso Colombiano +COP peso Colombiano +COP Peso colombiano +CRC Colón Costarricense +CRC colón Costarricense +CRC Colón costarricense +CUP Peso Cubano +CUP peso Cubano +CUP Peso cubano +RD$ Peso Dominicano +RD$ peso Dominicano +RD$ Peso dominicano +GBP Libra 
Esterlina +GBP libra Esterlina +GBP Libra esterlina +HKD Dólar De Hong Kong +HKD dólar de Hong Kong +HKD Dólar de hong kong +INR Rupia India +INR rupia India +INR Rupia india +Mex$ Peso Mexicano +Mex$ peso Mexicano +Mex$ Peso mexicano +SVC Colón Salvadoreño +SVC colón Salvadoreño +SVC Colón salvadoreño +UYU Peso Uruguayo +UYU peso Uruguayo +UYU Peso uruguayo +VES Bolívar Soberano +VES bolívar Soberano +VES Bolívar soberano +BOP Peso Boliviano +BOP peso Boliviano +BOP Peso boliviano +CLE Escudo Chileno +CLE escudo Chileno +CLE Escudo chileno +ECS Sucre Ecuatoriano +ECS sucre Ecuatoriano +ECS Sucre ecuatoriano +PEH Sol De Oro +PEH sol de Oro +PEH Sol de oro +VEB Bolívar Venezolano +VEB bolívar Venezolano +VEB Bolívar venezolano +VEF Bolívar Fuerte +VEF bolívar Fuerte +VEF Bolívar fuerte \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/ordinals/digit_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/digit_capitalized.tsv new file mode 100644 index 000000000..459ea85e1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/digit_capitalized.tsv @@ -0,0 +1,22 @@ +Primero uno +Primera uno +Primer uno +Segundo dos +Segunda dos +Tercero tres +Tercera tres +Tercer tres +Cuarto cuatro +Cuarta cuatro +Quinto cinco +Quinta cinco +Sexto seis +Sexta seis +Séptimo siete +Séptima siete +Sétimo siete +Sétima siete +Octavo ocho +Octava ocho +Noveno nueve +Novena nueve diff --git a/nemo_text_processing/inverse_text_normalization/es/data/ordinals/hundreds_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/hundreds_capitalized.tsv new file mode 100644 index 000000000..0172f8f63 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/hundreds_capitalized.tsv @@ -0,0 +1,18 @@ +Centésimo ciento +Centésima ciento +Ducentésimo doscientos +Ducentésima doscientos +Tricentésimo trescientos +Tricentésima trescientos 
+Cuadringentésimo cuatrocientos +Cuadringentésima cuatrocientos +Quingentésimo quinientos +Quingentésima quinientos +Sexcentésimo seiscientos +Sexcentésima seiscientos +Septingentésimo setecientos +Septingentésima setecientos +Octingentésimo ochocientos +Octingentésima ochocientos +Noningentésimo novecientos +Noningentésima novecientos diff --git a/nemo_text_processing/inverse_text_normalization/es/data/ordinals/teen_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/teen_capitalized.tsv new file mode 100644 index 000000000..80f012ba5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/teen_capitalized.tsv @@ -0,0 +1,60 @@ +Décimo diez +Décima diez +Decimoprimero once +Decimoprimera once +Decimoprimer once +Décimo Primero once +Décima Primera once +Décimo Primera once +Décimo Primer once +Undécimo once +Undécima once +Decimosegundo doce +Decimosegunda doce +Décimo Segundo doce +Décima Segunda doce +Décimo Segunda doce +Duodécimo doce +Duodécima doce +Decimotercero trece +Decimotercera trece +Decimotercer trece +Décimo Tercero trece +Décima Tercera trece +Décimo Tercera trece +Décimo Tercer trece +Decimocuarto catorce +Decimocuarta catorce +Décimo Cuarto catorce +Décima Cuarta catorce +Décimo Cuarta catorce +Decimoquinto quince +Decimoquinta quince +Décimo Quinto quince +Décima Quinta quince +Décimo Quinta quince +Decimosexto dieciséis +Decimosexta dieciséis +Décimo Sexto dieciséis +Décima Sexta dieciséis +Décimo Sexta dieciséis +Decimoséptimo diecisiete +Decimoséptima diecisiete +Décimo Séptimo diecisiete +Décima Séptima diecisiete +Décimo Séptima diecisiete +Décimo Sétimo diecisiete +Décimo Sétima diecisiete +Décima Sétima diecisiete +Decimosétimo diecisiete +Decimosétima diecisiete +Decimoctavo dieciocho +Decimoctava dieciocho +Décimo Octavo dieciocho +Décima Octava dieciocho +Décimo Octava dieciocho +Decimonoveno diecinueve +Decimonovena diecinueve +Décimo Noveno diecinueve +Décima Novena 
diecinueve +Décimo Novena diecinueve diff --git a/nemo_text_processing/inverse_text_normalization/es/data/ordinals/ties_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/ties_capitalized.tsv new file mode 100644 index 000000000..58e0eff28 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/ties_capitalized.tsv @@ -0,0 +1,15 @@ +Vigésimo veinte +Vigésima veinte +Trigésimo treinta +Cuadragésimo cuarenta +Cuadragésima cuarenta +Quincuagésimo cincuenta +Quincuagésima cincuenta +Sexagésimo sesenta +Sexagésima sesenta +Septuagésimo setenta +Septuagésima setenta +Octogésimo ochenta +Octogésima ochenta +Nonagésimo noventa +Nonagésima noventa diff --git a/nemo_text_processing/inverse_text_normalization/es/data/ordinals/twenties_capitalized.tsv b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/twenties_capitalized.tsv new file mode 100644 index 000000000..40e73e815 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/ordinals/twenties_capitalized.tsv @@ -0,0 +1,50 @@ +Vigesimoprimero veintiuno +Vigesimoprimera veintiuno +Vigesimoprimer veintiuno +Vigésimo Primero veintiuno +Vigésimo Primera veintiuno +Vigésima Primera veintiuno +Vigésimo Primer veintiuno +Vigesimosegundo veintidós +Vigesimosegunda veintidós +Vigésimo Segundo veintidós +Vigésimo Segunda veintidós +Vigésima Segunda veintidós +Vigesimotercero veintitrés +Vigesimotercera veintitrés +Vigesimotercer veintitrés +Vigésimo Tercero veintitrés +Vigésimo Tercera veintitrés +Vigésima Tercera veintitrés +Vigésimo Tercer veintitrés +Vigesimocuarto veinticuatro +Vigesimocuarta veinticuatro +Vigésimo Cuarto veinticuatro +Vigésimo Cuarta veinticuatro +Vigésima Cuarta veinticuatro +Vigesimoquinto veinticinco +Vigesimoquinta veinticinco +Vigésimo Quinto veinticinco +Vigésimo Quinta veinticinco +Vigésima Quinta veinticinco +Vigesimosexto veintiséis +Vigesimosexta veintiséis +Vigésimo Sexto veintiséis +Vigésimo Sexta 
veintiséis +Vigésima Sexta veintiséis +Vigesimoséptimo veintisiete +Vigesimoséptima veintisiete +Vigésimo Séptimo veintisiete +Vigésimo Séptima veintisiete +Vigésima Séptima veintisiete +Vigesimoctavo veintiocho +Vigesimoctava veintiocho +Vigesimooctavo veintiocho +Vigesimooctava veintiocho +Vigésimo Octavo veintiocho +Vigésimo Octava veintiocho +Vigésima Octava veintiocho +Vigesimonoveno veintinueve +Vigesimonovena veintinueve +Vigésimo Noveno veintinueve +Vigésimo Novena veintinueve diff --git a/nemo_text_processing/inverse_text_normalization/es/data/roman/digit.tsv b/nemo_text_processing/inverse_text_normalization/es/data/roman/digit.tsv index 0610b4a54..e5fde2cc6 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/roman/digit.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/roman/digit.tsv @@ -1,9 +1,9 @@ -i 1 -ii 2 -iii 3 -iv 4 -v 5 -vi 6 -vii 7 -viii 8 -ix 9 \ No newline at end of file +I 1 +II 2 +III 3 +IV 4 +V 5 +VI 6 +VII 7 +VIII 8 +IX 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/roman/hundreds.tsv b/nemo_text_processing/inverse_text_normalization/es/data/roman/hundreds.tsv index cdbdb6814..5e04779be 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/roman/hundreds.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/roman/hundreds.tsv @@ -1,9 +1,9 @@ -c 1 -cc 2 -ccc 3 -cd 4 -d 5 -dc 6 -dcc 7 -dccc 8 -cm 9 \ No newline at end of file +C 1 +CC 2 +CCC 3 +CD 4 +D 5 +DC 6 +DCC 7 +DCCC 8 +CM 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/roman/thousands.tsv b/nemo_text_processing/inverse_text_normalization/es/data/roman/thousands.tsv index 19e96b9c6..164689802 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/roman/thousands.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/roman/thousands.tsv @@ -1,3 +1,3 @@ -m 1 -mm 2 -mmm 3 +M 1 +MM 2 +MMM 3 diff --git 
a/nemo_text_processing/inverse_text_normalization/es/data/roman/ties.tsv b/nemo_text_processing/inverse_text_normalization/es/data/roman/ties.tsv index ac043aa14..445773d91 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/roman/ties.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/roman/ties.tsv @@ -1,9 +1,9 @@ -x 1 -xx 2 -xxx 3 -xl 4 -l 5 -lx 6 -lxx 7 -lxxx 8 -xc 9 \ No newline at end of file +X 1 +XX 2 +XXX 3 +XL 4 +L 5 +LX 6 +LXX 7 +LXXX 8 +XC 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/time/time_suffix_cased.tsv b/nemo_text_processing/inverse_text_normalization/es/data/time/time_suffix_cased.tsv new file mode 100644 index 000000000..b04bd5193 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/time/time_suffix_cased.tsv @@ -0,0 +1,15 @@ +Peme P.M. +Pe Eme P.M. +P M P.M. +PM P.M. +P.M. +p.M P.M. +Ame A.M +A Eme A.M +AM A.M +A.M +A.M A.M +A M A.M +de la tarde P.M. +de la noche P.M. 
+de la mañana A.M diff --git a/nemo_text_processing/inverse_text_normalization/es/data/time/time_zone.tsv b/nemo_text_processing/inverse_text_normalization/es/data/time/time_zone.tsv index 6c86a3e4e..55f0297b0 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/time/time_zone.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/time/time_zone.tsv @@ -1,42 +1,42 @@ -utc u t c -cst c s t -cet c e t -pst p s t -est e s t -mdt m d t -mst m s t -pt p t -et e t -mt m t -gmt g m t -adt hora de verano del atlántico -amt hora estándar del amazonas -art hora estándar de argentina -ast hora estándar del atlántico -bot hora de bolivia -brt hora estándar de brasilia -clst hora de verano de chile -clt hora estándar de chile -cot hora estándar de colombia -east hora estándar de la isla de pascua -ect hora de ecuador -eeast hora de verano de la isla de pascua -eest hora de verano de europa oriental -eet hora estándar de europa oriental -fkst hora de verano de las malvinas -fnt hora estándar de fernando de noronha -galt hora de galápagos -gft hora de la guayana francesa -gyt hora de guyana -hkt hora estándar de hong kong -jst hora estándar de japón -kst hora estándar de corea -pet hora estándar de perú -pyst hora de verano de paraguay -pyt hora estándar de paraguay -sgt hora de singapur -uyst hora de verano de uruguay -uyt hora de uruguay -vet hora de venezuela -west hora de verano de europa oriental -wet hora estándar de europa oriental +UTC u t c +CST c s t +CET c e t +PST p s t +EST e s t +MDT m d t +MST m s t +PT p t +ET e t +MT m t +GMT g m t +ADT hora de verano del atlántico +AMT hora estándar del amazonas +ART hora estándar de argentina +AST hora estándar del atlántico +BOT hora de bolivia +BRT hora estándar de brasilia +CLST hora de verano de chile +CLT hora estándar de chile +COT hora estándar de colombia +EAST hora estándar de la isla de pascua +ECT hora de ecuador +EEAST hora de verano de la isla de pascua +EEST hora de verano de europa 
oriental +EET hora estándar de europa oriental +FKST hora de verano de las malvinas +FNT hora estándar de fernando de noronha +GALT hora de galápagos +GFT hora de la guayana francesa +GYT hora de guyana +HKT hora estándar de hong kong +JST hora estándar de japón +KST hora estándar de corea +PET hora estándar de perú +PYST hora de verano de paraguay +PYT hora estándar de paraguay +SGT hora de singapur +UYST hora de verano de uruguay +UYT hora de uruguay +VET hora de venezuela +WEST hora de verano de europa oriental +WET hora estándar de europa oriental \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/time/time_zone_cased.tsv b/nemo_text_processing/inverse_text_normalization/es/data/time/time_zone_cased.tsv new file mode 100644 index 000000000..e635698d3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/es/data/time/time_zone_cased.tsv @@ -0,0 +1,42 @@ +UTC U T C +CST C S T +CET C E T +PST P S T +EST E S T +MDT M D T +MST M S T +PT P T +ET E T +MT M T +GMT G M T +ADT Hora de Verano del Atlántico +AMT Hora Estándar del Amazonas +ART Hora Estándar de Argentina +AST Hora Estándar del Atlántico +BOT Hora de Bolivia +BRT Hora Estándar de Brasilia +CLST Hora de Verano de Chile +CLT Hora Estándar de Chile +COT Hora Estándar de Colombia +EAST Hora Estándar de la Isla de Pascua +ECT Hora de Ecuador +EEAST Hora de Verano de la Isla de Pascua +EEST Hora de Verano de Europa Oriental +EET Hora Estándar de Europa Oriental +FKST Hora de Verano de Las Malvinas +FNT Hora Estándar de Fernando de Noronha +GALT Hora de Galápagos +GFT Hora de la Guayana Francesa +GYT Hora de Guyana +HKT Hora Estándar de Hong Kong +JST Hora Estándar de Japón +KST Hora Estándar de Corea +PET Hora Estándar de Perú +PYST Hora de Verano de Paraguay +PYT Hora Estándar de Paraguay +SGT Hora de Singapur +UYST Hora de Verano de Uruguay +UYT Hora de Uruguay +VET Hora de Venezuela +WEST Hora de Verano de Europa Oriental +WET Hora Estándar de Europa 
Oriental \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv index 60253820a..3973c1a0c 100644 --- a/nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv +++ b/nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv @@ -1,16 +1,47 @@ -ud. usted -uds. ustedes -vd. vosotros -vds. vosotros -dr. doctor -dra. doctora -d. don -da. doña -ee. uu. estados unidos +Ud. usted +Uds. ustedes +Vd. vosotros +Vds. vosotros +Dr. doctor +Dra. doctora +D. don +Da. doña +EE. UU. estados unidos p.ej. por ejemplo -prof. profesor -profa. profesora -sr. señor -sra. señora -srta. señorita +Prof. profesor +Profa. profesora +Sr. señor +Sra. señora +Sres. señores +Srta. señorita etc. etcétera +TXT t x t +TXT T x t +TXT T X T +GPS g p s +GPS G p s +GPS G P S +DNI d n i +DNI D n i +DNI D N I +ISSN I S S N +ISSN I s s n +ISSN i s s n +SMS S M S +SMS S m s +SMS s m s +ISBN I S B N +ISBN I s b n +ISBN i s b n +URL U R L +URL U r l +URL u r l +RPG R P G +RPG R p g +RPG r p g +ADN A D N +ADN A d n +ADN a d n +JPG J P G +JPG J p g +JPG j p g \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py index 085b6bff1..2f62d589d 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py @@ -15,9 +15,17 @@ import pynini from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_SPACE, GraphFst, delete_space +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + NEMO_DIGIT, + NEMO_SPACE, + GraphFst, + capitalized_input_graph, + 
delete_space, +) +from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS class CardinalFst(GraphFst): @@ -35,10 +43,15 @@ class CardinalFst(GraphFst): inside cardinal numbers). e.g. "mil y una" -> cardinal { integer: "1001"} e.g. "ciento y una" -> cardinal { integer: "101"} + + Args: + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self): + def __init__(self, input_case: str = INPUT_LOWER_CASED): super().__init__(name="cardinal", kind="classify") + self.input_case = input_case + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")) @@ -47,7 +60,7 @@ def __init__(self): graph_hundreds = pynini.string_file(get_abs_path("data/numbers/hundreds.tsv")) full_graph_ties = (graph_ties | pynutil.insert("0")) + ( - (delete_space + pynutil.delete("y") + delete_space + graph_digit) | pynutil.insert("0") + (delete_space + self.delete_word("y") + delete_space + graph_digit) | pynutil.insert("0") ) graph_hundred_component = graph_hundreds | pynutil.insert("0") @@ -61,27 +74,27 @@ def __init__(self): ) self.graph_hundred_component_at_least_one_none_zero_digit = ( graph_hundred_component_at_least_one_none_zero_digit - ) + ).optimize() graph_thousands = pynini.union( - graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("mil"), - pynutil.insert("001") + pynutil.delete("mil"), # because we say 'mil', not 'un mil' + graph_hundred_component_at_least_one_none_zero_digit + delete_space + self.delete_word("mil"), + pynutil.insert("001") + self.delete_word("mil"), # because we say 'mil', not 'un mil' pynutil.insert("000", weight=0.1), ) graph_millones = pynini.union( graph_hundred_component_at_least_one_none_zero_digit + delete_space - + (pynutil.delete("millones") | pynutil.delete("millón")), - pynutil.insert("000") + pynutil.delete("millones"), 
# to allow for 'mil millones' + + (self.delete_word("millones") | self.delete_word("millón")), + pynutil.insert("000") + self.delete_word("millones"), # to allow for 'mil millones' ) graph_mil_millones = pynini.union( - graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("mil"), - pynutil.insert("001") + pynutil.delete("mil"), # because we say 'mil', not 'un mil' + graph_hundred_component_at_least_one_none_zero_digit + delete_space + self.delete_word("mil"), + pynutil.insert("001") + self.delete_word("mil"), # because we say 'mil', not 'un mil' ) graph_mil_millones += delete_space + ( - graph_millones | pynutil.insert("000") + pynutil.delete("millones") + graph_millones | pynutil.insert("000") + self.delete_word("millones") ) # allow for 'mil millones' graph_mil_millones |= pynutil.insert("000000", weight=0.1) @@ -89,36 +102,36 @@ def __init__(self): graph_millardo = ( graph_hundred_component_at_least_one_none_zero_digit + delete_space - + (pynutil.delete("millardo") | pynutil.delete("millardos")) + + (self.delete_word("millardo") | self.delete_word("millardos")) ) graph_billones = pynini.union( graph_hundred_component_at_least_one_none_zero_digit + delete_space - + (pynutil.delete("billones") | pynutil.delete("billón")), + + (self.delete_word("billones") | self.delete_word("billón")), ) graph_mil_billones = pynini.union( - graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("mil"), - pynutil.insert("001") + pynutil.delete("mil"), # because we say 'mil', not 'un mil' + graph_hundred_component_at_least_one_none_zero_digit + delete_space + self.delete_word("mil"), + pynutil.insert("001") + self.delete_word("mil"), # because we say 'mil', not 'un mil' ) graph_mil_billones += delete_space + ( - graph_billones | pynutil.insert("000") + pynutil.delete("billones") + graph_billones | pynutil.insert("000") + self.delete_word("billones") ) # allow for 'mil billones' graph_mil_billones |= 
pynutil.insert("000000", weight=0.1) graph_trillones = pynini.union( graph_hundred_component_at_least_one_none_zero_digit + delete_space - + (pynutil.delete("trillones") | pynutil.delete("trillón")), + + (self.delete_word("trillones") | self.delete_word("trillón")), ) graph_mil_trillones = pynini.union( - graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("mil"), - pynutil.insert("001") + pynutil.delete("mil"), # because we say 'mil', not 'un mil' + graph_hundred_component_at_least_one_none_zero_digit + delete_space + self.delete_word("mil"), + pynutil.insert("001") + self.delete_word("mil"), # because we say 'mil', not 'un mil' ) graph_mil_trillones += delete_space + ( - graph_trillones | pynutil.insert("000") + pynutil.delete("trillones") + graph_trillones | pynutil.insert("000") + self.delete_word("trillones") ) # allow for 'mil trillones' graph_mil_trillones |= pynutil.insert("000000", weight=0.1) @@ -144,12 +157,12 @@ def __init__(self): pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0" ) - self.graph_no_exception = graph + self.graph_no_exception = graph.optimize() # save self.numbers_up_to_thousand for use in DecimalFst digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3) - numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize() - self.numbers_up_to_thousand = numbers_up_to_thousand + numbers_up_to_thousand = pynini.compose(self.graph_no_exception, digits_up_to_thousand).optimize() + self.numbers_up_to_thousand = numbers_up_to_thousand.optimize() # save self.numbers_up_to_million for use in DecimalFst digits_up_to_million = ( @@ -161,18 +174,35 @@ def __init__(self): | (NEMO_DIGIT ** 6) ) numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize() - self.numbers_up_to_million = numbers_up_to_million + self.numbers_up_to_million = numbers_up_to_million.optimize() + + if input_case == INPUT_CASED: + graph 
|= capitalized_input_graph(graph) + graph_digit |= capitalized_input_graph(graph_digit) + graph_zero |= capitalized_input_graph(graph_zero) + # graph_exception = capitalized_input_graph(graph_exception) + self.graph_no_exception |= capitalized_input_graph(self.graph_no_exception).optimize() + self.numbers_up_to_thousand |= capitalized_input_graph(self.numbers_up_to_thousand).optimize() # don't convert cardinals from zero to nine inclusive graph_exception = pynini.project(pynini.closure(NEMO_SPACE, 0, 1) + (graph_digit | graph_zero), 'input') - self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph + self.graph = ((pynini.project(graph, "input") - graph_exception.arcsort()) @ graph).optimize() optional_minus_graph = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("menos", "\"-\"") + NEMO_SPACE, 0, 1 + pynutil.insert("negative: ") + pynini.cross(ES_MINUS, "\"-\"") + NEMO_SPACE, 0, 1 ) final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() + + def delete_word(self, word: str): + """ Capitalizes word for `cased` input""" + delete_graph = pynutil.delete(word).optimize() + if self.input_case == INPUT_CASED: + if len(word) > 0: + delete_graph |= pynutil.delete(word[0].upper() + word[1:]) + + return delete_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py index c4320825c..af96ee002 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/date.py @@ -14,10 +14,16 @@ import pynini from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.graph_utils import int_to_roman from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path -from 
nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + GraphFst, + capitalized_input_graph, + delete_extra_space, + delete_space, +) class DateFst(GraphFst): @@ -25,9 +31,13 @@ class DateFst(GraphFst): Finite state transducer for classifying date, e.g. primero de enero -> date { day: "1" month: "enero" } e.g. uno de enero -> date { day: "1" month: "enero" } + + Args: + cardinal: CardinalFst + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self, cardinal: GraphFst): + def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): super().__init__(name="date", kind="classify") graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) @@ -38,6 +48,10 @@ def __init__(self, cardinal: GraphFst): graph_month = pynini.string_file(get_abs_path("data/dates/months.tsv")) graph_suffix = pynini.string_file(get_abs_path("data/dates/year_suffix.tsv")).invert() + if input_case == INPUT_CASED: + graph_month |= pynini.string_file(get_abs_path("data/dates/months_cased.tsv")) + graph_suffix |= pynini.string_file(get_abs_path("data/dates/year_suffix_cased.tsv")).invert() + graph_1_to_100 = pynini.union( graph_digit, graph_twenties, @@ -68,5 +82,9 @@ def __init__(self, cardinal: GraphFst): final_graph = graph_dm | roman_centuries_graph | year_with_suffix_graph final_graph += pynutil.insert(" preserve_order: true") + + if input_case == INPUT_CASED: + final_graph |= capitalized_input_graph(final_graph) + final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py index b98c5b1e2..2b1949041 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py +++ 
b/nemo_text_processing/inverse_text_normalization/es/taggers/decimal.py @@ -14,17 +14,25 @@ import pynini from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + MIN_NEG_WEIGHT, NEMO_DIGIT, + NEMO_SIGMA, + TO_LOWER, GraphFst, + capitalized_input_graph, delete_extra_space, delete_space, ) +from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS -def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_million: 'pynini.FstLike') -> 'pynini.FstLike': +def get_quantity( + decimal: 'pynini.FstLike', cardinal_up_to_million: 'pynini.FstLike', input_case: str = INPUT_LOWER_CASED +) -> 'pynini.FstLike': """ Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, e.g. one million -> integer_part: "1" quantity: "million" @@ -33,12 +41,13 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_million: 'pynini.FstL Args: decimal: decimal FST cardinal_up_to_million: cardinal FST + input_case: accepting either "lower_cased" or "cased" input. """ numbers = cardinal_up_to_million @ ( pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) ) - suffix = pynini.union( + suffix_labels = [ "millón", "millones", "millardo", @@ -49,7 +58,12 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_million: 'pynini.FstL "trillones", "cuatrillón", "cuatrillones", - ) + ] + suffix = pynini.union(*suffix_labels) + + if input_case == INPUT_CASED: + suffix |= pynini.union(*[x[0].upper() + x[1:] for x in suffix_labels]).optimize() + res = ( pynutil.insert("integer_part: \"") + numbers @@ -79,23 +93,28 @@ class DecimalFst(GraphFst): e.g. 
mil ochocientos veinticuatro millones -> decimal { negative: "false" integer_part: "1824" quantity: "millones" } Args: cardinal: CardinalFst + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self, cardinal: GraphFst): + def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): super().__init__(name="decimal", kind="classify") # number after decimal point can be any series of cardinals <1000, including 'zero' graph_decimal = cardinal.numbers_up_to_thousand graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal - self.graph = graph_decimal + self.graph = graph_decimal.optimize() # decimal point can be denoted by 'coma' or 'punto' decimal_point = pynini.cross("coma", "morphosyntactic_features: \",\"") decimal_point |= pynini.cross("punto", "morphosyntactic_features: \".\"") + if input_case == INPUT_CASED: + decimal_point |= pynini.cross("Coma", "morphosyntactic_features: \",\"") + decimal_point |= pynini.cross("Punto", "morphosyntactic_features: \".\"") + optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("menos", "\"true\"") + delete_extra_space, 0, 1 + pynutil.insert("negative: ") + pynini.cross(ES_MINUS, "\"true\"") + delete_extra_space, 0, 1 ) graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"") @@ -110,9 +129,21 @@ def __init__(self, cardinal: GraphFst): ) final_graph = optional_graph_negative + final_graph_wo_sign - self.final_graph_wo_negative = final_graph_wo_sign | get_quantity( - final_graph_wo_sign, cardinal.numbers_up_to_million + self.final_graph_wo_negative = ( + final_graph_wo_sign + | get_quantity(final_graph_wo_sign, cardinal.numbers_up_to_million, input_case=input_case).optimize() ) - final_graph |= optional_graph_negative + get_quantity(final_graph_wo_sign, cardinal.numbers_up_to_million) + + # accept semiotic spans that start with a capital letter + self.final_graph_wo_negative |= 
pynutil.add_weight( + pynini.compose(TO_LOWER + NEMO_SIGMA, self.final_graph_wo_negative), MIN_NEG_WEIGHT + ).optimize() + + quantity_graph = get_quantity(final_graph_wo_sign, cardinal.numbers_up_to_million, input_case=input_case) + final_graph |= optional_graph_negative + quantity_graph + + if input_case == INPUT_CASED: + final_graph |= capitalized_input_graph(final_graph) + final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py index 98d8f60de..3bc6a8b6d 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py @@ -15,8 +15,18 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.en.utils import get_various_formats from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, GraphFst, insert_space +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + MIN_POS_WEIGHT, + NEMO_ALPHA, + GraphFst, + capitalized_input_graph, + insert_space, +) +from nemo_text_processing.text_normalization.en.utils import load_labels class ElectronicFst(GraphFst): @@ -26,22 +36,36 @@ class ElectronicFst(GraphFst): and URLS (which get converted to a "protocol" field). e.g. c d f uno arroba a b c punto e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } } e.g. doble ve doble ve doble ve a b c punto e d u -> tokens { electronic { protocol: "www.abc.edu" } } + + Args: + input_case: accepting either "lower_cased" or "cased" input. 
""" - def __init__(self): + def __init__(self, input_case: str = INPUT_LOWER_CASED): super().__init__(name="electronic", kind="classify") delete_extra_space = pynutil.delete(" ") - alpha_num = ( - NEMO_ALPHA - | pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - | pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + + num = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) | pynini.string_file( + get_abs_path("data/numbers/zero.tsv") ) + if input_case == INPUT_CASED: + num = capitalized_input_graph(num) + + alpha_num = (NEMO_ALPHA | num).optimize() symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).invert() + if input_case == INPUT_CASED: + symbols = capitalized_input_graph(symbols) accepted_username = alpha_num | symbols - process_dot = pynini.cross("punto", ".") + dot = pynini.accep("punto") + if input_case == INPUT_CASED: + dot |= pynini.accep("Punto") + process_dot = pynini.cross(dot, ".") + alternative_dot = ( + pynini.closure(delete_extra_space, 0, 1) + pynini.accep(".") + pynini.closure(delete_extra_space, 0, 1) + ) username = ( pynutil.insert("username: \"") + alpha_num @@ -51,25 +75,60 @@ def __init__(self): + pynutil.insert("\"") ) single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num - server = single_alphanum | pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).invert() - domain = single_alphanum | pynini.string_file(get_abs_path("data/electronic/domain.tsv")).invert() + + server_names = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).invert() + if input_case == INPUT_CASED: + server_names = capitalized_input_graph(server_names) + server = single_alphanum | server_names | pynini.closure(NEMO_ALPHA, 2) + + if input_case == INPUT_CASED: + domain = [] + # get domain formats + for d in load_labels(get_abs_path("data/electronic/domain.tsv")): + domain.extend(get_various_formats(d[0])) + domain = pynini.string_map(domain).optimize() + else: + domain = 
pynini.string_file(get_abs_path("data/electronic/domain.tsv")).invert() + + domain = pynutil.add_weight(single_alphanum, weight=-0.0001) | domain | pynini.closure(NEMO_ALPHA, 2) + domain_graph = ( pynutil.insert("domain: \"") + server - + delete_extra_space - + process_dot - + delete_extra_space + + ((delete_extra_space + process_dot + delete_extra_space) | alternative_dot) + domain + pynutil.insert("\"") ) - graph = ( - username + delete_extra_space + pynutil.delete("arroba") + insert_space + delete_extra_space + domain_graph - ) + + at = pynini.accep("arroba") + if input_case == INPUT_CASED: + at |= pynini.accep("Arroba") + + graph = username + delete_extra_space + pynutil.delete(at) + insert_space + delete_extra_space + domain_graph ############# url ### - protocol_end = pynini.cross(pynini.union("www", "w w w", "doble ve doble ve doble ve"), "www") - protocol_start = pynini.cross(pynini.union("http", "h t t p", "hache te te pe"), "http") - protocol_start |= pynini.cross(pynini.union("https", "h t t p s", "hache te te pe ese"), "https") + if input_case == INPUT_CASED: + spoken_ws = pynini.union( + "doble ve doble ve doble ve", "Doble Ve Doble Ve Doble Ve", "Doble ve doble ve doble ve" + ) + protocol_end = pynini.cross(pynini.union(*get_various_formats("www")) | spoken_ws, "www") + + spoken_http = pynini.union("hache te te pe", "Hache te te pe", "Hache Te Te Pe") + spoken_https = pynini.union("hache te te pe ese", "Hache te te pe ese", "Hache Te Te Pe Ese") + protocol_start = pynini.cross( + pynini.union(*get_various_formats("http")) | spoken_http, "http" + ) | pynini.cross(pynini.union(*get_various_formats("https")) | spoken_https, "https") + else: + protocol_end = pynutil.add_weight( + pynini.cross(pynini.union("www", "w w w", "doble ve doble ve doble ve"), "www"), MIN_POS_WEIGHT + ) + protocol_start = pynutil.add_weight( + pynini.cross(pynini.union("http", "h t t p", "hache te te pe"), "http"), MIN_POS_WEIGHT + ) + protocol_start |= pynutil.add_weight( + 
pynini.cross(pynini.union("https", "h t t p s", "hache te te pe ese"), "https"), MIN_POS_WEIGHT + ) + protocol_start += pynini.cross(" dos puntos barra barra ", "://") # e.g. .com, .es @@ -80,18 +139,35 @@ def __init__(self): + (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username) ) + protocol_default = ( + ( + (pynini.closure(delete_extra_space + accepted_username, 1) | server) + | pynutil.add_weight(pynini.closure(NEMO_ALPHA, 1), weight=0.001) + ) + + pynini.closure(ending, 1) + ).optimize() + protocol = ( pynini.closure(protocol_start, 0, 1) + protocol_end + delete_extra_space + process_dot + delete_extra_space - + (pynini.closure(delete_extra_space + accepted_username, 1) | server) - + pynini.closure(ending, 1) - ) + + protocol_default + ).optimize() + + if input_case == INPUT_CASED: + protocol |= ( + pynini.closure(protocol_start, 0, 1) + protocol_end + alternative_dot + protocol_default + ).optimize() + + protocol |= pynini.closure(protocol_end + delete_extra_space + process_dot, 0, 1) + protocol_default + protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"") graph |= protocol - ######## + + if input_case == INPUT_CASED: + graph = capitalized_input_graph(graph, capitalized_graph_weight=MIN_POS_WEIGHT) final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py index 4fcf63706..a2b55026e 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/fraction.py @@ -15,9 +15,9 @@ import pynini from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, NEMO_SPACE, GraphFst +from 
nemo_text_processing.text_normalization.en.graph_utils import INPUT_LOWER_CASED, NEMO_SIGMA, NEMO_SPACE, GraphFst +from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS class FractionFst(GraphFst): @@ -38,9 +38,10 @@ class FractionFst(GraphFst): Args: cardinal: CardinalFst ordinal: OrdinalFst + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self, cardinal: GraphFst, ordinal: GraphFst): + def __init__(self, cardinal: GraphFst, ordinal: GraphFst, input_case: str = INPUT_LOWER_CASED): super().__init__(name="fraction", kind="classify") cardinal_graph = cardinal.graph_no_exception @@ -75,7 +76,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): # process negative fractions # e.g. "menos dos tercios" -> "fractions { negative: True numerator: "2" denominator: "3" }" - optional_negative_graph = pynini.closure(pynini.cross("menos", "negative: \"True\"") + NEMO_SPACE, 0, 1) + optional_negative_graph = pynini.closure(pynini.cross(ES_MINUS, "negative: \"True\"") + NEMO_SPACE, 0, 1) # process mixed fractions # e.g. 
"dos y dos tercios" -> "fractions { integer_part: "2" numerator: "2" denominator: "3" }" @@ -97,7 +98,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): ) proper_fractions_with_medio = optional_negative_graph + proper_fractions_with_medio - self.proper_fractions_with_medio = self.add_tokens(proper_fractions_with_medio) + self.proper_fractions_with_medio = self.add_tokens(proper_fractions_with_medio).optimize() graph = ( optional_negative_graph + optional_integer_part_graph + numerators_graph + NEMO_SPACE + denominators_graph diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py index 68770a05c..9d231bc25 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/measure.py @@ -14,16 +14,19 @@ import pynini from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, NEMO_ALPHA, NEMO_SIGMA, + TO_LOWER, GraphFst, convert_space, delete_extra_space, delete_space, ) +from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS class MeasureFst(GraphFst): @@ -34,9 +37,11 @@ class MeasureFst(GraphFst): Args: cardinal: CardinalFst decimal: DecimalFst + fraction: FractionFst + input_case: accepting either "lower_cased" or "cased" input. 
""" - def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst): + def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, input_case: str = INPUT_LOWER_CASED): super().__init__(name="measure", kind="classify") cardinal_graph = cardinal.graph_no_exception @@ -46,13 +51,19 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst): math_symbols = pynini.string_file(get_abs_path("data/measures/math_symbols.tsv")) equal_symbol = pynini.string_map([("es igual a", "="), ("igual a", "=")]) + # accept capital letters in units + casing_graph = pynini.closure(TO_LOWER | NEMO_SIGMA).optimize() + graph_unit_singular = pynini.string_file(get_abs_path("data/measures/measurements_singular.tsv")) graph_unit_singular = pynini.invert(graph_unit_singular) # singular -> abbr + graph_unit_singular = pynini.compose(casing_graph, graph_unit_singular).optimize() + graph_unit_plural = pynini.string_file(get_abs_path("data/measures/measurements_plural.tsv")) graph_unit_plural = pynini.invert(graph_unit_plural) # plural -> abbr + graph_unit_plural = pynini.compose(casing_graph, graph_unit_plural).optimize() optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("menos", "\"true\"") + delete_extra_space, 0, 1 + pynutil.insert("negative: ") + pynini.cross(ES_MINUS, "\"true\"") + delete_extra_space, 0, 1 ) unit_singular = convert_space(graph_unit_singular) diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/money.py b/nemo_text_processing/inverse_text_normalization/es/taggers/money.py index 3caea6f41..a65d9bcd2 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/money.py @@ -17,9 +17,12 @@ from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, NEMO_DIGIT, 
NEMO_SIGMA, GraphFst, + capitalized_input_graph, convert_space, delete_extra_space, delete_space, @@ -35,9 +38,10 @@ class MoneyFst(GraphFst): Args: cardinal: CardinalFst decimal: DecimalFst + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst): + def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPUT_LOWER_CASED): super().__init__(name="money", kind="classify") # quantity, integer_part, fractional_part, currency @@ -54,6 +58,22 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): unit_minor_plural = pynini.string_file(get_abs_path("data/money/currency_minor_plural.tsv")) unit_minor_plural = pynini.invert(unit_minor_plural) + if input_case == INPUT_CASED: + unit_singular = capitalized_input_graph(unit_singular) + unit_singular_capitalized = pynini.string_file( + get_abs_path("data/money/currency_major_singular_capitalized.tsv") + ) + unit_singular |= pynini.invert(unit_singular_capitalized).optimize() + + unit_plural = capitalized_input_graph(unit_plural) + unit_plural_capitalized = pynini.string_file( + get_abs_path("data/money/currency_major_plural_capitalized.tsv") + ) + unit_plural |= pynini.invert(unit_plural_capitalized).optimize() + + unit_minor_singular = capitalized_input_graph(unit_minor_singular).optimize() + unit_minor_plural = capitalized_input_graph(unit_minor_plural).optimize() + graph_unit_singular = pynutil.insert("currency: \"") + convert_space(unit_singular) + pynutil.insert("\"") graph_unit_plural = pynutil.insert("currency: \"") + convert_space(unit_plural) + pynutil.insert("\"") @@ -66,22 +86,31 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT) + one_graph = pynini.union("un", "una").optimize() + if input_case == INPUT_CASED: + one_graph |= pynini.union("Un", "Una").optimize() + # twelve dollars (and) fifty cents, zero cents 
cents_standalone = ( pynutil.insert("morphosyntactic_features: \",\"") # always use a comma in the decimal + insert_space + pynutil.insert("fractional_part: \"") + pynini.union( - pynutil.add_weight(((NEMO_SIGMA - "un") @ cardinal_graph), -0.7) @ add_leading_zero_to_double_digit + pynutil.add_weight(((NEMO_SIGMA - one_graph) @ cardinal_graph), -0.7) + @ add_leading_zero_to_double_digit + delete_space, - pynini.cross("un", "01") + delete_space, + pynini.cross(one_graph, "01") + delete_space, ) + pynutil.insert("\"") ) + and_graph = pynini.union("con", "y").optimize() + if input_case == INPUT_CASED: + and_graph |= pynini.union("Con", "Y").optimize() + optional_cents_standalone = pynini.closure( delete_space - + pynini.closure((pynutil.delete("con") | pynutil.delete('y')) + delete_space, 0, 1) + + pynini.closure(pynutil.delete(and_graph) + delete_space, 0, 1) + insert_space + cents_standalone + pynutil.delete(pynini.union(unit_minor_singular, unit_minor_plural)), @@ -96,7 +125,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + pynutil.insert("morphosyntactic_features: \",\"") # always use a comma in the decimal + insert_space + pynutil.insert("fractional_part: \"") - + pynini.closure(pynutil.delete("con") + delete_space, 0, 1) + + pynini.closure(pynutil.delete(pynini.union("con", "Con")) + delete_space, 0, 1) + pynutil.add_weight(cardinal_graph @ add_leading_zero_to_double_digit, -0.7) + pynutil.insert("\""), 0, @@ -105,7 +134,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph_integer = ( pynutil.insert("integer_part: \"") - + ((NEMO_SIGMA - "un" - "una") @ cardinal_graph) + + ((NEMO_SIGMA - one_graph) @ cardinal_graph) + pynutil.insert("\"") + delete_extra_space + graph_unit_plural @@ -113,7 +142,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) graph_integer |= ( pynutil.insert("integer_part: \"") - + (pynini.cross("un", "1") | pynini.cross("una", "1")) + + pynini.cross(one_graph, "1") + pynutil.insert("\"") + 
delete_extra_space + graph_unit_singular diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py index 9b4ffaac8..d03640742 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py @@ -14,9 +14,15 @@ import pynini from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst, delete_space +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + NEMO_SIGMA, + GraphFst, + capitalized_input_graph, + delete_space, +) class OrdinalFst(GraphFst): @@ -34,9 +40,10 @@ class OrdinalFst(GraphFst): Args: cardinal: CardinalFst + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self, cardinal: GraphFst): + def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): super().__init__(name="ordinal", kind="classify") cardinal_graph = cardinal.graph_no_exception @@ -46,6 +53,13 @@ def __init__(self, cardinal: GraphFst): graph_ties = pynini.string_file(get_abs_path("data/ordinals/ties.tsv")) graph_hundreds = pynini.string_file(get_abs_path("data/ordinals/hundreds.tsv")) + if input_case == INPUT_CASED: + graph_digit |= pynini.string_file(get_abs_path("data/ordinals/digit_capitalized.tsv")).optimize() + graph_teens |= pynini.string_file(get_abs_path("data/ordinals/teen_capitalized.tsv")).optimize() + graph_twenties |= pynini.string_file(get_abs_path("data/ordinals/twenties_capitalized.tsv")).optimize() + graph_ties |= pynini.string_file(get_abs_path("data/ordinals/ties_capitalized.tsv")).optimize() + graph_hundreds |= pynini.string_file(get_abs_path("data/ordinals/hundreds_capitalized.tsv")).optimize() + full_graph_ties = graph_ties | (graph_ties + 
pynini.cross(" ", "y") + graph_digit) ordinal_graph_union = pynini.union(graph_digit, graph_teens, graph_twenties, full_graph_ties, graph_hundreds,) @@ -65,7 +79,7 @@ def __init__(self, cardinal: GraphFst): graph_a_suffix = (optional_numbers_in_front + ordinal_graph_a) @ cardinal_graph graph_er_suffix = (optional_numbers_in_front + ordinal_graph_er) @ cardinal_graph - self.graph_masc_num_no_exception = graph_o_suffix + self.graph_masc_num_no_exception = graph_o_suffix.optimize() # don't convert ordinals from one to nine inclusive graph_exception = pynini.project(pynini.union(graph_digit), 'input') @@ -73,6 +87,11 @@ def __init__(self, cardinal: GraphFst): graph_a_suffix = (pynini.project(graph_a_suffix, "input") - graph_exception.arcsort()) @ graph_a_suffix graph_er_suffix = (pynini.project(graph_er_suffix, "input") - graph_exception.arcsort()) @ graph_er_suffix + if input_case == INPUT_CASED: + graph_o_suffix = capitalized_input_graph(graph_o_suffix) + graph_a_suffix = capitalized_input_graph(graph_a_suffix) + graph_er_suffix = capitalized_input_graph(graph_er_suffix) + graph = ( pynutil.insert("integer: \"") + graph_o_suffix diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py index 5043443c4..1c0be2037 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py @@ -14,9 +14,15 @@ import pynini from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_space +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + GraphFst, + capitalized_input_graph, + delete_space, +) +from nemo_text_processing.text_normalization.es.graph_utils import ES_PLUS class 
TelephoneFst(GraphFst): @@ -31,9 +37,12 @@ class TelephoneFst(GraphFst): "twelve thirty four" = "1234". (we ignore more complicated cases such as "three hundred and two" or "three nines"). + + Args: + input_case: accepting either "lower_cased" or "cased" input. """ - def __init__(self): + def __init__(self, input_case: str = INPUT_LOWER_CASED): super().__init__(name="telephone", kind="classify") # create `single_digits` and `double_digits` graphs as these will be @@ -42,8 +51,16 @@ def __init__(self): graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")) graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")) graph_twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv")) + graph_zero = pynini.cross("cero", "0") + + if input_case == INPUT_CASED: + graph_digit = capitalized_input_graph(graph_digit) + graph_ties = capitalized_input_graph(graph_ties) + graph_teen = capitalized_input_graph(graph_teen) + graph_twenties = capitalized_input_graph(graph_twenties) + graph_zero = pynini.cross(pynini.union("cero", "Cero"), "0").optimize() - single_digits = graph_digit.optimize() | pynini.cross("cero", "0") + single_digits = graph_digit.optimize() | graph_zero double_digits = pynini.union( graph_twenties, @@ -58,7 +75,7 @@ def __init__(self): digit_thrice = digit_twice + pynutil.delete(" ") + single_digits # accept `doble cero` -> `00` and `triple ocho` -> `888` - digit_words = pynini.union(graph_digit.optimize(), pynini.cross("cero", "0")).invert() + digit_words = pynini.union(graph_digit.optimize(), graph_zero).invert() doubled_digit = pynini.union( *[ @@ -112,14 +129,26 @@ def __init__(self): # 8-digit option eight_digit_graph = group_of_four + insert_separator + group_of_four + plus = pynini.accep("más") + if input_case == INPUT_CASED: + plus |= ES_PLUS + # optionally denormalize country codes optional_country_code = pynini.closure( - pynini.cross("más ", "+") + (single_digits | group_of_two | group_of_three) + 
insert_separator, 0, 1 + pynini.cross(plus, "+") + + delete_space + + (single_digits | group_of_two | group_of_three) + + insert_separator, + 0, + 1, ) + ext_phrase = pynini.accep(" extensión ") + if input_case == INPUT_CASED: + ext_phrase = pynini.union(" extensión ", " Extensión ") # optionally denormalize extensions optional_extension = pynini.closure( - pynini.cross(" extensión ", " ext. ") + (single_digits | group_of_two | group_of_three), 0, 1 + pynini.cross(ext_phrase, " ext. ") + (single_digits | group_of_two | group_of_three), 0, 1 ) number_part = ( @@ -131,5 +160,8 @@ def __init__(self): number_part = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"") graph = number_part + if input_case == INPUT_CASED: + graph |= capitalized_input_graph(graph) + final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py index a45432085..9d55f35a3 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/time.py @@ -15,15 +15,18 @@ import pynini from pynini.lib import pynutil - from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, GraphFst, + capitalized_input_graph, convert_space, delete_extra_space, delete_space, insert_space, ) +from nemo_text_processing.text_normalization.es.graph_utils import ES_MINUS, ES_PLUS class TimeFst(GraphFst): @@ -60,9 +63,12 @@ class TimeFst(GraphFst): so far the rules have not been added to the TimeFst tagger to process timezones (to keep the rules simple, and because timezones are not very often specified in Spanish.) + + Args: + input_case: accepting either "lower_cased" or "cased" input. 
""" - def __init__(self): + def __init__(self, input_case: str = INPUT_LOWER_CASED): super().__init__(name="time", kind="classify") suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv")) @@ -82,7 +88,7 @@ def __init__(self): graph_teen, (graph_ties + pynutil.insert("0")), (graph_ties + pynutil.delete(" y ") + graph_digit), - ) + ).optimize() # note that graph_hour will start from 2 hours # "1 o'clock" will be treated differently because it @@ -90,24 +96,51 @@ def __init__(self): digits_2_to_23 = [str(digits) for digits in range(2, 24)] digits_1_to_59 = [str(digits) for digits in range(1, 60)] - graph_1oclock = pynini.cross("la una", "la 1") - graph_hour = pynini.cross("las ", "las ") + graph_1_to_100 @ pynini.union(*digits_2_to_23) + one_o_clock = pynini.accep("la una") + article = pynini.accep("las ") + half = pynini.accep("media") + quarter = pynini.accep("cuarto") + and_graph = pynini.union("y", "con") + hours_word_graph = pynini.accep(" horas") + minutes_word_graph = pynini.union(" minuto", " minutos") + + if input_case == INPUT_CASED: + suffix_graph |= pynini.string_file(get_abs_path("data/time/time_suffix_cased.tsv")).optimize() + time_zones |= pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone_cased.tsv"))).optimize() + graph_digit |= capitalized_input_graph(graph_digit).optimize() + graph_1_to_100 |= capitalized_input_graph(graph_1_to_100).optimize() + article |= pynini.accep("Las ").optimize() + half |= pynini.accep("Media").optimize() + quarter |= pynini.accep("Cuarto").optimize() + and_graph |= pynini.union("Y", "Con").optimize() + hours_word_graph |= pynini.accep(" Horas").optimize() + minutes_word_graph |= pynini.union(" Minuto", " Minutos").optimize() + + graph_one_o_clock = pynini.cross(one_o_clock, "la 1") + if input_case == INPUT_CASED: + graph_one_o_clock |= pynini.cross(pynini.accep("la Una"), "la 1") + one_o_clock_capitalized = pynini.union("La Una", "La una") + graph_one_o_clock |= 
pynini.cross(one_o_clock_capitalized, "La 1").optimize() + + graph_hour = article + graph_1_to_100 @ pynini.union(*digits_2_to_23) graph_minute = graph_1_to_100 @ pynini.union(*digits_1_to_59) - graph_minute_verbose = pynini.cross("media", "30") | pynini.cross("cuarto", "15") + graph_minute_verbose = pynini.cross(half, "30") | pynini.cross(quarter, "15") - final_graph_hour = pynutil.insert("hours: \"") + (graph_1oclock | graph_hour) + pynutil.insert("\"") + final_graph_hour = pynutil.insert("hours: \"") + (graph_one_o_clock | graph_hour) + pynutil.insert("\"") final_graph_minute = ( pynutil.insert("minutes: \"") - + pynini.closure((pynutil.delete("y") | pynutil.delete("con")) + delete_space, 0, 1) + + pynini.closure(pynutil.delete(and_graph) + delete_space, 0, 1) + (graph_minute | graph_minute_verbose) + + pynini.closure(pynutil.delete(minutes_word_graph), 0, 1) + + pynini.closure(pynutil.delete(hours_word_graph), 0, 1) + pynutil.insert("\"") - ) + ).optimize() # g m t más tres -> las 2:00 p.m. 
gmt+3 digits_1_to_23 = [str(digits) for digits in range(1, 24)] offset = graph_1_to_100 @ pynini.union(*digits_1_to_23) - sign = pynini.cross("más", "+") | pynini.cross("menos", "-") + sign = pynini.cross(ES_PLUS, "+") | pynini.cross(ES_MINUS, "-") full_offset = pynutil.delete(" ") + sign + pynutil.delete(" ") + offset graph_offset = pynini.closure(full_offset, 0, 1) graph_time_zones = pynini.accep(" ") + time_zones + graph_offset @@ -127,12 +160,14 @@ def __init__(self): ) # las nueve a eme (only convert on-the-hour times if they are followed by a suffix) - graph_1oclock_with_suffix = pynini.closure(pynini.accep("la "), 0, 1) + pynini.cross("una", "1") - graph_hour_with_suffix = pynini.closure(pynini.accep("las "), 0, 1) + graph_1_to_100 @ pynini.union( - *digits_2_to_23 + graph_one_o_clock_with_suffix = pynini.closure(pynini.union("la ", "La "), 0, 1) + pynini.cross( + pynini.union("una", "Una"), "1" ) + graph_hour_with_suffix = pynini.closure(article, 0, 1) + graph_1_to_100 @ pynini.union(*digits_2_to_23) final_graph_hour_with_suffix = ( - pynutil.insert("hours: \"") + (graph_1oclock_with_suffix | graph_hour_with_suffix) + pynutil.insert("\"") + pynutil.insert("hours: \"") + + (graph_one_o_clock_with_suffix | graph_hour_with_suffix) + + pynutil.insert("\"") ) graph_hsuffix = ( @@ -171,14 +206,14 @@ def __init__(self): + delete_extra_space + pynutil.insert("minutes: \"") + delete_space - + pynutil.delete("menos") + + pynutil.delete(ES_MINUS) + delete_space + pynini.union( - pynini.cross("cinco", "55"), - pynini.cross("diez", "50"), - pynini.cross("cuarto", "45"), - pynini.cross("veinte", "40"), - pynini.cross("veinticinco", "30"), + pynini.cross(pynini.union("cinco", "Cinco"), "55"), + pynini.cross(pynini.union("diez", "Diez"), "50"), + pynini.cross(pynini.union("cuarto", "Cuarto"), "45"), + pynini.cross(pynini.union("veinte", "Veinte"), "40"), + pynini.cross(pynini.union("veinticinco", "Veinticinco"), "35"), ) + pynutil.insert("\"") ) diff --git 
a/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py index bed7ad019..26483fb83 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py @@ -72,31 +72,33 @@ def __init__( else: logger.info(f"Creating ClassifyFst grammars.") - cardinal = CardinalFst() + cardinal = CardinalFst(input_case=input_case) cardinal_graph = cardinal.fst - ordinal = OrdinalFst(cardinal) + ordinal = OrdinalFst(cardinal, input_case=input_case) ordinal_graph = ordinal.fst - decimal = DecimalFst(cardinal) + decimal = DecimalFst(cardinal, input_case=input_case) decimal_graph = decimal.fst - fraction = FractionFst(cardinal, ordinal) + fraction = FractionFst(cardinal, ordinal, input_case=input_case) fraction_graph = fraction.fst - measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction).fst - date_graph = DateFst(cardinal).fst + measure_graph = MeasureFst( + cardinal=cardinal, decimal=decimal, fraction=fraction, input_case=input_case + ).fst + date_graph = DateFst(cardinal, input_case=input_case).fst word_graph = WordFst().fst - time_graph = TimeFst().fst - money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst - whitelist_graph = WhiteListFst(input_file=whitelist).fst + time_graph = TimeFst(input_case=input_case).fst + money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, input_case=input_case).fst + whitelist_graph = WhiteListFst(input_file=whitelist, input_case=input_case).fst punct_graph = PunctuationFst().fst - electronic_graph = ElectronicFst().fst - telephone_graph = TelephoneFst().fst + electronic_graph = ElectronicFst(input_case=input_case).fst + telephone_graph = TelephoneFst(input_case=input_case).fst classify = ( pynutil.add_weight(whitelist_graph, 1.01) - | pynutil.add_weight(time_graph, 1.1) + | 
pynutil.add_weight(time_graph, 1.08) | pynutil.add_weight(date_graph, 1.09) | pynutil.add_weight(decimal_graph, 1.09) | pynutil.add_weight(fraction_graph, 1.09) @@ -105,7 +107,7 @@ def __init__( | pynutil.add_weight(ordinal_graph, 1.6) | pynutil.add_weight(money_graph, 1.6) | pynutil.add_weight(telephone_graph, 1.6) - | pynutil.add_weight(electronic_graph, 1.6) + | pynutil.add_weight(electronic_graph, 2.96) | pynutil.add_weight(word_graph, 100) ) diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py index 7b9159724..55f10efc6 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py @@ -12,11 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os + import pynini from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + GraphFst, + convert_space, +) +from nemo_text_processing.text_normalization.en.utils import load_labels class WhiteListFst(GraphFst): @@ -28,16 +36,50 @@ class WhiteListFst(GraphFst): Whitelisted tokens are defined and loaded from "data/whitelist.tsv" (unless input_file specified). Args: + input_case: accepting either "lower_cased" or "cased" input. input_file: path to a file with whitelist replacements (each line of the file: written_form\tspoken_form\n), e.g. 
nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv """ - def __init__(self, input_file: str = None): + def __init__(self, input_case: str = INPUT_LOWER_CASED, input_file: str = None): super().__init__(name="whitelist", kind="classify") - if input_file: - whitelist = pynini.string_file(input_file).invert() - else: - whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert() + def get_whitelist_graph(input_file: str): + labels = load_labels(input_file) + + if input_case == INPUT_CASED: + additional_labels = [] + for written, spoken in labels: + written_capitalized = written[0].upper() + written[1:] + additional_labels.extend( + [ + [written_capitalized, spoken.capitalize()], # first letter capitalized + [ + written_capitalized, + spoken.upper().replace(" Y ", " y "), + ], # # add pairs with the all letters capitalized + ] + ) + + spoken_no_space = spoken.replace(" ", "") + # add abbreviations without spaces (both lower and upper case), i.e. "BMW" not "B M W" + if len(spoken) == (2 * len(spoken_no_space) - 1): + additional_labels.extend( + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]] + ) + + labels += additional_labels + + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + if input_file is None: + input_file = get_abs_path("data/whitelist.tsv") + + if not os.path.exists(input_file): + raise ValueError(f"Whitelist file {input_file} not found") + + whitelist = get_whitelist_graph(input_file) + graph = pynutil.insert("name: \"") + convert_space(whitelist) + pynutil.insert("\"") self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py index 70cb9349f..61b9e0b84 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/date.py @@ -15,6 +15,7 @@ import pynini 
from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.es.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, @@ -32,6 +33,9 @@ class DateFst(GraphFst): def __init__(self): super().__init__(name="date", kind="verbalize") + graph_month = pynini.string_file(get_abs_path("data/dates/months.tsv")) + graph_month |= pynini.string_file(get_abs_path("data/dates/months_cased.tsv")) + year = ( pynutil.delete("year:") + delete_space @@ -39,13 +43,7 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") ) - month = ( - pynutil.delete("month:") - + delete_space - + pynutil.delete("\"") - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") - ) + month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + graph_month + pynutil.delete("\"") day = ( pynutil.delete("day:") + delete_space diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py index e2f185f56..44892b0be 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/time.py @@ -35,12 +35,14 @@ def __init__(self): super().__init__(name="time", kind="verbalize") add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT) - # hour may or may not include preposition ("la" or "las") + # hour may or may not include article ("la" or "las") + article = pynini.union("la ", "las ", "La ", "Las ") + hour = ( pynutil.delete("hours:") + delete_space + pynutil.delete("\"") - + pynini.closure(pynini.union("la ", "las "), 0, 1) + + pynini.closure(article, 0, 1) + pynini.closure(NEMO_DIGIT, 1) + pynutil.delete("\"") ) diff --git a/nemo_text_processing/inverse_text_normalization/es_en/data/es_whitelist.tsv 
b/nemo_text_processing/inverse_text_normalization/es_en/data/es_whitelist.tsv index 60253820a..cfa739c1a 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/data/es_whitelist.tsv +++ b/nemo_text_processing/inverse_text_normalization/es_en/data/es_whitelist.tsv @@ -1,16 +1,16 @@ -ud. usted -uds. ustedes -vd. vosotros -vds. vosotros -dr. doctor -dra. doctora -d. don -da. doña -ee. uu. estados unidos +Ud. usted +Uds. ustedes +Vd. vosotros +Vds. vosotros +Dr. doctor +Dra. doctora +D. don +Da. doña +EE. UU. estados unidos p.ej. por ejemplo -prof. profesor -profa. profesora -sr. señor -sra. señora -srta. señorita -etc. etcétera +Prof. profesor +Profa. profesora +Sr. señor +Sra. señora +Srta. señorita +etc. etcétera \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py index ebf2a2a2e..ebef11007 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/taggers/tokenize_and_classify.py @@ -90,27 +90,29 @@ def __init__( else: logger.info(f"Creating ClassifyFst grammars.") - cardinal = CardinalFst() + cardinal = CardinalFst(input_case=input_case) cardinal_graph = cardinal.fst - ordinal = OrdinalFst(cardinal) + ordinal = OrdinalFst(cardinal, input_case=input_case) ordinal_graph = ordinal.fst - decimal = DecimalFst(cardinal) + decimal = DecimalFst(cardinal, input_case=input_case) decimal_graph = decimal.fst - fraction = FractionFst(cardinal, ordinal) + fraction = FractionFst(cardinal, ordinal, input_case=input_case) fraction_graph = fraction.fst - measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction).fst - date_graph = DateFst(cardinal).fst + measure_graph = MeasureFst( + cardinal=cardinal, decimal=decimal, fraction=fraction, input_case=input_case + ).fst + date_graph = 
DateFst(cardinal, input_case=input_case).fst word_graph = WordFst().fst - time_graph = TimeFst().fst - money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst + time_graph = TimeFst(input_case=input_case).fst + money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, input_case=input_case).fst whitelist_graph = WhiteListFst(input_file=whitelist).fst punct_graph = PunctuationFst().fst - electronic_graph = ElectronicFst().fst - telephone_graph = TelephoneFst().fst + electronic_graph = ElectronicFst(input_case=input_case).fst + telephone_graph = TelephoneFst(input_case=input_case).fst en_cardinal = EnCardinalFst(input_case=input_case) en_cardinal_graph = en_cardinal.fst @@ -151,7 +153,7 @@ def __init__( | pynutil.add_weight(en_money_graph, 1.1) | pynutil.add_weight(telephone_graph, 1.6) | pynutil.add_weight(en_telephone_graph, 1.1) - | pynutil.add_weight(electronic_graph, 1.6) + | pynutil.add_weight(electronic_graph, 2.3) | pynutil.add_weight(en_electronic_graph, 1.1) | pynutil.add_weight(word_graph, 100) | pynutil.add_weight(en_word_graph, 120) diff --git a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py index fd0955994..483d083c8 100644 --- a/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/es_en/verbalizers/verbalize.py @@ -75,10 +75,12 @@ def __init__(self): en_date_graph = EnDateFst().fst en_whitelist_graph = EnWhiteListFst().fst en_telephone_graph = EnTelephoneFst().fst + en_time_graph = EnTimeFst().fst en_electronic_graph = EnElectronicFst().fst graph = ( - time_graph + en_time_graph + | pynutil.add_weight(time_graph, 1.1) | date_graph | pynutil.add_weight(en_date_graph, 1.1) | money_graph diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 69ac68762..d88e1a047 
100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py index 47fdc4e6b..01a85ec10 100644 --- a/nemo_text_processing/text_normalization/data_loader_utils.py +++ b/nemo_text_processing/text_normalization/data_loader_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/text_normalization/es/graph_utils.py b/nemo_text_processing/text_normalization/es/graph_utils.py index 24ae0aee3..6dbe08417 100644 --- a/nemo_text_processing/text_normalization/es/graph_utils.py +++ b/nemo_text_processing/text_normalization/es/graph_utils.py @@ -40,6 +40,10 @@ fem_hundreds = hundreds @ pynini.cdrewrite(pynini.cross("ientos", "ientas"), "", "", NEMO_SIGMA) +ES_MINUS = pynini.union("menos", "Menos", "MENOS").optimize() +ES_PLUS = pynini.union("más", "Más", "MÁS").optimize() + + def strip_accent(fst: 'pynini.FstLike') -> 'pynini.FstLike': """ Converts all accented vowels to non-accented equivalents diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_cardinal_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_cardinal_cased.txt new file mode 100644 index 000000000..15514ae12 --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_cardinal_cased.txt @@ -0,0 +1,30 @@ +Doscientos cincuenta y uno~251 +Novecientos noventa y nueve millones novecientos noventa y nueve mil novecientos noventa y nueve~999999999 +Cero~Cero +Uno~Uno +una~una +dos~dos +Nueve~Nueve +Diez~10 +, uno~, uno +, diez~, 10 +Menos veintitrés~-23 +cien~100 +ciento uno~101 +ciento un~101 +ciento una~101 +mil y uno~1001 +Mil una~1001 +nueve billones setecientos ochenta y nueve mil trescientos ochenta y dos millones quinientos treinta y seis mil ciento treinta~9789382536130 +Doscientos cincuenta y cuatro~254 +ciento cuarenta y siete mil cuatrocientos cincuenta y uno~147451 +Un Millón ciento cincuenta y seis mil ciento setenta y tres~1156173 +Mil quinientos noventa y tres millones setenta y dos mil novecientos sesenta y uno~1593072961 +noventa y siete mil ochocientos ocho billones doscientos sesenta y cuatro mil setecientos setenta y dos millones setecientos noventa y dos mil cinco~97808264772792005 +diecisiete mil ochocientos cincuenta y cinco trillones 
treinta y seis mil seiscientos cincuenta y siete billones siete mil quinientos noventa y seis millones ciento diez mil novecientos cuarenta y nueve~17855036657007596110949 +diez mil diez billones diez millones cien mil diez~10010000010100010 +Menos veinticinco mil treinta y siete~-25037 +mil doscientos sesenta y cuatro billones trescientos un mil novecientos treinta y ocho millones ciento cuatro~1264301938000104 +menos sesenta~-60 +cuarenta y seis mil seiscientos sesenta y cuatro~46664 +sesenta~60 \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_date_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_date_cased.txt new file mode 100644 index 000000000..98bfd6fe3 --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_date_cased.txt @@ -0,0 +1,8 @@ +Primero De Enero~1 de Enero +Uno de enero~1 de Enero +el uno de Diciembre~el 1 de Diciembre +El primero de diciembre~El 1 de diciembre +Domingo Veintiséis De Octubre~Domingo 26 de Octubre +treinta y uno de diciembre de mil novecientos noventa y dos~31 de diciembre de 1992 +Siglo diecinueve~Siglo xix +doscientos tres antes de Cristo~203 A. C. 
\ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_decimal_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_decimal_cased.txt new file mode 100644 index 000000000..81a91bb3a --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_decimal_cased.txt @@ -0,0 +1,6 @@ +Uno coma dos seis~1,26 +Menos uno coma dos seis~-1,26 +Uno Coma Veintiséis~1,26 +Cero coma Dos seis~0,26 +cero coma veintiséis~0,26 +tres coma ciento cuarenta y uno~3,141 \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_electronic_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_electronic_cased.txt new file mode 100644 index 000000000..2d3f26b9d --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_electronic_cased.txt @@ -0,0 +1,5 @@ +A punto B C Arroba G mail punto com~A.BC@gmail.com +c d f Arroba a b c Punto e d u~cdf@abc.edu +W W W Punto N vidia Punto com~www.nvidia.com +Doble ve doble ve doble ve punto a b c punto es barra e f g~www.abc.es/efg +Doble Ve Doble Ve Doble Ve Punto a b c Punto e s~www.abc.es \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure.txt index 02895142e..6b80918b5 100644 --- a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure.txt @@ -13,7 +13,7 @@ cuatro segundos~4 s cinco litros~5 l tres metros cúbicos~3 m³ dos kilómetros por hora~2 kph -diez grados farenheit~10 ° f +diez grados farenheit~10 ° F dos metros y medio~2 1/2 m tres quintos de metro~3/5 m menos tres y medio metros por hora~-3 1/2 m/h diff --git 
a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure_cased.txt new file mode 100644 index 000000000..ad28add7a --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_measure_cased.txt @@ -0,0 +1,11 @@ +Doscientos metros~200 m +tres horas~3 h +una hora~1 h +Doscientos cuarenta y cinco Millas Por Hora~245 mph +Dos Kilos~2 kg +sesenta coma dos cuatro cero cero kilogramos~60,2400 kg +Menos sesenta coma veinticuatro cero cero kilogramos~-60,2400 kg +menos Ocho Coma Cinco Dos por ciento~-8,52 % +uno Porciento~1 % +tres centímetros~3 cm +dos más dos es igual a cuatro~2 + 2 = 4 \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_money_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_money_cased.txt new file mode 100644 index 000000000..a57e6065a --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_money_cased.txt @@ -0,0 +1,6 @@ +doce dólares y cinco centavos~$12,05 +Doce Dólares Y Cinco Céntimos~$12,05 +setenta y cinco Dólares sesenta y tres~$75,63 +Veintinueve dólares cincuenta centavos~$29,50 +Catorce millones quinientos mil Pesos mexicanos~Mex$14500000 +diez pesos Mexicanos~Mex$10 \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_ordinal_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_ordinal_cased.txt new file mode 100644 index 000000000..0dd13fd54 --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_ordinal_cased.txt @@ -0,0 +1,11 @@ +primero~primero +Tercera~Tercera +Primer~Primer +tercer~tercer +Décima~10.ª +undécimo~11.º +Decimoprimer~11.ᵉʳ +Décimo primer~11.ᵉʳ +Décima Primera~11.ª +(technically ungrammatical) décimo 
primera~(technically ungrammatical) 11.ª +decimotercero~13.º \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_telephone_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_telephone_cased.txt new file mode 100644 index 000000000..068867d68 --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_telephone_cased.txt @@ -0,0 +1,6 @@ +Uno dos tres uno dos tres cinco seis siete ocho~123-123-5678 +uno veintitrés uno veintitrés cincuenta y seis setenta y ocho~123-123-5678 +Uno Dos Tres Cuatro Cinco Seis Siete Ocho Nueve~123-456-789 +Triple tres uno dos tres cinco seis siete ocho~333-123-5678 +Más uno uno dos tres uno dos tres cinco seis siete ocho~+1-123-123-5678 +más cincuenta y cuatro uno dos tres uno dos tres cinco seis siete ocho Extensión doce~+54-123-123-5678 ext. 12 \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time.txt index 8ea4b35f1..e74a63fda 100644 --- a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time.txt @@ -22,4 +22,4 @@ cuarto para las cero~las 23:45 cuarto para las veinticuatro~las 23:45 diez para las doce~las 11:50 dos y media de la tarde~2:30 p.m. -la una de la tarde u t c más cuatro~la 1:00 p.m. utc+4 +la una de la tarde u t c más cuatro~la 1:00 p.m. 
UTC+4 diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time_cased.txt new file mode 100644 index 000000000..ba450d79b --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_time_cased.txt @@ -0,0 +1,9 @@ +las dieciséis cincuenta~las 16:50 +la una~la una +Las dos~Las dos +Las tres personas~Las tres personas +Las Dos a eme~Las 2:00 a.m. +la una Pe Eme~la 1:00 P.M. +la una y diez~la 1:10 +la una y Diez a eme~la 1:10 a.m. +La Una Y Diez pe eme~La 1:10 p.m. \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_whitelist.txt index f142f8954..d6aa3211c 100644 --- a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_whitelist.txt +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_whitelist.txt @@ -1,5 +1,5 @@ -usted~ud. -ustedes~uds. -habla usted español~habla ud. español -hablan ustedes español~hablan uds. español -estados unidos~ee. uu. \ No newline at end of file +usted~Ud. +ustedes~Uds. +habla usted español~habla Ud. español +hablan ustedes español~hablan Uds. español +estados unidos~EE. UU. \ No newline at end of file diff --git a/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_word_cased.txt b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_word_cased.txt new file mode 100644 index 000000000..38681016d --- /dev/null +++ b/tests/nemo_text_processing/es/data_inverse_text_normalization/test_cases_word_cased.txt @@ -0,0 +1,11 @@ +~ +Yahoo!~Yahoo! +Veinte!~20 ! 
+X ~X +—~— +AAA~AAA +Aabach~Aabach +aabenraa~aabenraa +Aachen's~Aachen's +aadri~aadri +aaliyan's~aaliyan's \ No newline at end of file diff --git a/tests/nemo_text_processing/es/test_cardinal.py b/tests/nemo_text_processing/es/test_cardinal.py index cb53c517a..e1b57fca3 100644 --- a/tests/nemo_text_processing/es/test_cardinal.py +++ b/tests/nemo_text_processing/es/test_cardinal.py @@ -24,13 +24,27 @@ class TestCardinal: - inverse_normalizer = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + # inverse_normalizer = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_cardinal.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) + assert pred == expected + + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_cardinal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) assert pred == expected normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) diff --git a/tests/nemo_text_processing/es/test_date.py b/tests/nemo_text_processing/es/test_date.py index af1c3d96f..4b12236ee 100644 --- a/tests/nemo_text_processing/es/test_date.py +++ 
b/tests/nemo_text_processing/es/test_date.py @@ -24,6 +24,9 @@ class TestDate: inverse_normalizer = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_date.txt')) @pytest.mark.run_only_on('CPU') @@ -32,6 +35,16 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_cardinal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = NormalizerWithAudio( diff --git a/tests/nemo_text_processing/es/test_decimal.py b/tests/nemo_text_processing/es/test_decimal.py index 39edf7066..7467e45b2 100644 --- a/tests/nemo_text_processing/es/test_decimal.py +++ b/tests/nemo_text_processing/es/test_decimal.py @@ -24,6 +24,9 @@ class TestDecimal: inverse_normalizer = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_decimal.txt')) @pytest.mark.run_only_on('CPU') @@ -32,6 +35,16 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected 
+ pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_decimal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git a/tests/nemo_text_processing/es/test_electronic.py b/tests/nemo_text_processing/es/test_electronic.py index d476b79b0..ae0e4530c 100644 --- a/tests/nemo_text_processing/es/test_electronic.py +++ b/tests/nemo_text_processing/es/test_electronic.py @@ -24,6 +24,9 @@ class TestElectronic: inverse_normalizer_es = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_electronic.txt')) @pytest.mark.run_only_on('CPU') @@ -32,6 +35,16 @@ def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) assert pred == expected + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_electronic_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git 
a/tests/nemo_text_processing/es/test_fraction.py b/tests/nemo_text_processing/es/test_fraction.py index a189f4689..d0e818726 100644 --- a/tests/nemo_text_processing/es/test_fraction.py +++ b/tests/nemo_text_processing/es/test_fraction.py @@ -16,6 +16,7 @@ import pytest from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio @@ -23,6 +24,21 @@ class TestFraction: + inverse_normalizer = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_fraction.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git a/tests/nemo_text_processing/es/test_measure.py b/tests/nemo_text_processing/es/test_measure.py index 2e644c8db..572c88d03 100644 --- a/tests/nemo_text_processing/es/test_measure.py +++ b/tests/nemo_text_processing/es/test_measure.py @@ -25,6 +25,9 @@ class TestMeasure: inverse_normalizer_es = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) 
@parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_measure.txt')) @pytest.mark.run_only_on('CPU') @@ -33,6 +36,16 @@ def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) assert pred == expected + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_measure_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git a/tests/nemo_text_processing/es/test_money.py b/tests/nemo_text_processing/es/test_money.py index a0ebb8313..acc1fea82 100644 --- a/tests/nemo_text_processing/es/test_money.py +++ b/tests/nemo_text_processing/es/test_money.py @@ -25,6 +25,9 @@ class TestMoney: inverse_normalizer = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_money.txt')) @pytest.mark.run_only_on('CPU') @@ -33,6 +36,16 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_money_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_cased(self, test_input, expected):
+ pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git a/tests/nemo_text_processing/es/test_ordinal.py b/tests/nemo_text_processing/es/test_ordinal.py index 41741f2de..e2cd7d4a2 100644 --- a/tests/nemo_text_processing/es/test_ordinal.py +++ b/tests/nemo_text_processing/es/test_ordinal.py @@ -25,6 +25,9 @@ class TestOrdinal: inverse_normalizer = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_ordinal.txt')) @pytest.mark.run_only_on('CPU') @@ -33,6 +36,16 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_ordinal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_cased(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( NormalizerWithAudio(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) diff --git a/tests/nemo_text_processing/es/test_telephone.py b/tests/nemo_text_processing/es/test_telephone.py index 489eb0930..265f877f6 100644 --- a/tests/nemo_text_processing/es/test_telephone.py +++ b/tests/nemo_text_processing/es/test_telephone.py @@ -25,6 +25,9 @@ class
TestTelephone: inverse_normalizer_es = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_telephone.txt')) @pytest.mark.run_only_on('CPU') @@ -33,6 +36,16 @@ def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) assert pred == expected + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_telephone_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git a/tests/nemo_text_processing/es/test_time.py b/tests/nemo_text_processing/es/test_time.py index a55fcba7f..8b958c508 100644 --- a/tests/nemo_text_processing/es/test_time.py +++ b/tests/nemo_text_processing/es/test_time.py @@ -24,6 +24,9 @@ class TestTime: inverse_normalizer_es = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_time.txt')) @pytest.mark.run_only_on('CPU') @@ -32,6 +35,16 @@ def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) assert pred == expected + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, 
verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_time_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio = ( diff --git a/tests/nemo_text_processing/es/test_word.py b/tests/nemo_text_processing/es/test_word.py index f23e83c05..11002ea99 100644 --- a/tests/nemo_text_processing/es/test_word.py +++ b/tests/nemo_text_processing/es/test_word.py @@ -24,6 +24,9 @@ class TestWord: inverse_normalizer_es = InverseNormalizer(lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_es_cased = InverseNormalizer( + lang='es', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_word.txt')) @pytest.mark.run_only_on('CPU') @@ -32,6 +35,16 @@ def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) assert pred == expected + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('es/data_inverse_text_normalization/test_cases_word_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_es_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + normalizer_es = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) normalizer_with_audio_es = ( NormalizerWithAudio(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) diff --git 
a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_cardinal_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_cardinal_cased.txt new file mode 100644 index 000000000..d6fc7a85e --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_cardinal_cased.txt @@ -0,0 +1,60 @@ +Doscientos cincuenta y uno~251 +Novecientos noventa y nueve millones novecientos noventa y nueve mil novecientos noventa y nueve~999999999 +Cero~Cero +Uno~Uno +una~una +dos~dos +Nueve~Nueve +Diez~10 +, uno~, uno +, diez~, 10 +Menos veintitrés~-23 +cien~100 +ciento uno~101 +ciento un~101 +ciento una~101 +mil y uno~1001 +Mil una~1001 +nueve billones setecientos ochenta y nueve mil trescientos ochenta y dos millones quinientos treinta y seis mil ciento treinta~9789382536130 +Doscientos cincuenta y cuatro~254 +ciento cuarenta y siete mil cuatrocientos cincuenta y uno~147451 +Un Millón ciento cincuenta y seis mil ciento setenta y tres~1156173 +Mil quinientos noventa y tres millones setenta y dos mil novecientos sesenta y uno~1593072961 +noventa y siete mil ochocientos ocho billones doscientos sesenta y cuatro mil setecientos setenta y dos millones setecientos noventa y dos mil cinco~97808264772792005 +diecisiete mil ochocientos cincuenta y cinco trillones treinta y seis mil seiscientos cincuenta y siete billones siete mil quinientos noventa y seis millones ciento diez mil novecientos cuarenta y nueve~17855036657007596110949 +diez mil diez billones diez millones cien mil diez~10010000010100010 +Menos veinticinco mil treinta y siete~-25037 +mil doscientos sesenta y cuatro billones trescientos un mil novecientos treinta y ocho millones ciento cuatro~1264301938000104 +menos sesenta~-60 +cuarenta y seis mil seiscientos sesenta y cuatro~46664 +sesenta~60 +Nine trillion seven hundred eighty nine billion three hundred eighty two million five hundred thirty six thousand one hundred thirty~9789382536130 +Two 
hundred and fifty four~254 +One hundred forty seven thousand four hundred fifty one~147451 +One million one hundred fifty six thousand one hundred seventy three~1156173 +One billion five hundred ninety three million seventy two thousand nine hundred sixty one~1593072961 +Ninety seven quadrillion eight hundred eight trillion two hundred sixty four billion seven hundred seventy two million seven hundred ninety two thousand five~97808264772792005 +Seventeen sextillion eight hundred fifty five quintillion thirty six quadrillion six hundred fifty seven trillion seven billion five hundred ninety six million one hundred ten thousand nine hundred forty nine~17855036657007596110949 +Ten quadrillion ten trillion ten million one hundred thousand ten~10010000010100010 +Minus twenty five thousand thirty seven~-25037 +One quadrillion two hundred sixty four trillion three hundred one billion nine hundred thirty eight million one hundred four~1264301938000104 +Minus sixty~-60 +Forty six thousand six hundred sixty four~46664 +Sixty~60 +Zero~Zero +Two million three~2000003 +One thousand thirteen~1013 +One thousand one~1001 +One thousand one hundred~1100 +One thousand twenty six~1026 +One thousand one hundred twenty six~1126 +Eighteen million four hundred fifty thousand nine hundred ninety~18450990 +Eighteen million nine hundred forty thousand seven hundred twenty two~18940722 +Eighteen million six hundred ninety thousand nine hundred sixteen~18690916 +Eighteen thousand eight hundred eighty~18880 +Eleven hundred~1100 +Twenty one hundred~2100 +Twenty one hundred and eleven~2111 +Eleven hundred twenty one~1121 +Nineteen~19 +Twelve~Twelve \ No newline at end of file diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date.txt index f961fbfbb..1147a12c7 100644 --- a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date.txt +++ 
b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date.txt @@ -13,7 +13,7 @@ two thousand and nine~2009 the twenty fifth of july twenty twelve~25 july 2012 the twenty fifth of july two thousand twelve~25 july 2012 the twenty second of july twenty twelve~22 july 2012 -the fifteenth of january~15 de january +the fifteenth of january~15 january the seventeenth of may twenty ten~17 may 2010 january first~january 1 july twenty second two thousand eight~july 22 2008 diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date_cased.txt new file mode 100644 index 000000000..ab8358dd6 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_date_cased.txt @@ -0,0 +1,20 @@ +Primero De Enero~1 de Enero +Uno de enero~1 de Enero +el uno de Diciembre~el 1 de Diciembre +El primero de diciembre~El 1 de diciembre +Domingo Veintiséis De Octubre~Domingo 26 de Octubre +treinta y uno de diciembre de mil novecientos noventa y dos~31 de diciembre de 1992 +Siglo diecinueve~Siglo xix +doscientos tres antes de Cristo~203 A. C. 
+January first~January 1 +July twenty second two thousand eight~July 22 2008 +June thirty~June 30 +July twenty fifth twenty twelve~July 25 2012 +Nineteen seventeen~1917 +Twenty twelve~2012 +March sixteen sixty five~March 1665 +Sixteen sixty five~1665 +July two thousand twelve~July 2012 +October nineteen oh five~October 1905 +July fifteen o six~July 1506 +The twenty fifth of july twenty twelve~25 july 2012 \ No newline at end of file diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_decimal_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_decimal_cased.txt new file mode 100644 index 000000000..17b91271d --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_decimal_cased.txt @@ -0,0 +1,19 @@ +Uno coma dos seis~1,26 +Menos uno coma dos seis~-1,26 +Uno Coma Veintiséis~1,26 +Cero coma Dos seis~0,26 +cero coma veintiséis~0,26 +tres coma ciento cuarenta y uno~3,141 +One point two five billion~1.25 billion +Thirteen billion~13 billion +Thirty billion~30 billion +Thirty Billion~30 Billion +Two thousand eight hundred five point eight seven three billion~2805.873 billion +Eighteen~18 +Eighteen point eight five~18.85 +Eighteen point five o~18.50 +Eighteen point five six~18.56 +Eighteen point nine~18.9 +Eighteen point o five~18.05 +Eighteen point one two~18.12 +Eighteen point o one~18.01 diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_electronic_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_electronic_cased.txt new file mode 100644 index 000000000..9131f4f94 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_electronic_cased.txt @@ -0,0 +1,14 @@ +A punto B C Arroba G mail punto com~A.BC@gmail.com +c d f Arroba a b c Punto e d u~cdf@abc.edu +W W W Punto N vidia Punto com~www.nvidia.com +Doble ve doble ve doble ve punto a b c punto es 
barra e f g~www.abc.es/efg +Doble Ve Doble Ve Doble Ve Punto a b c Punto e s~www.abc.es +N vidia dot com~nvidia.com +Abc at gmail dot com~Abc@gmail.com +Athreed at gmail dot com~Athreed@gmail.com +Kore dot ai~Kore.ai +NVIDIA dot com~NVIDIA.com +NVIDIA dot COM~NVIDIA.COM +WWW.A B C at A B C dot com~WWW.A BC@ABC.com +W W W. A B C dot com~www.ABC.com +w w w . o u r d a i l y n e w s dot com . s m~www.ourdailynews.com . s m \ No newline at end of file diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure.txt index 0a9b90903..01278697f 100644 --- a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure.txt @@ -13,7 +13,7 @@ cuatro segundos~4 s cinco litros~5 l tres metros cúbicos~3 m³ dos kilómetros por hora~2 kph -diez grados farenheit~10 ° f +diez grados farenheit~10 ° F dos metros y medio~2 1/2 m tres quintos de metro~3/5 m menos tres y medio metros por hora~-3 1/2 m/h diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure_cased.txt new file mode 100644 index 000000000..7073a7bf6 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_measure_cased.txt @@ -0,0 +1,22 @@ +Doscientos metros~200 m +tres horas~3 h +una hora~1 h +Doscientos cuarenta y cinco Millas Por Hora~245 mph +Dos Kilos~2 kg +sesenta coma dos cuatro cero cero kilogramos~60,2400 kg +Menos sesenta coma veinticuatro cero cero kilogramos~-60,2400 kg +menos Ocho Coma Cinco Dos por ciento~-8,52 % +uno Porciento~1 % +tres centímetros~3 cm +dos más dos es igual a cuatro~2 + 2 = 4 +Eight point five megawatts~8.5 mW +Eight point five meters~8.5 m +Eight point five two percent~8.52 % 
+Eight point four four percent~8.44 % +one gigabit per second~1 gbps +nine gigabits per second~9 gbps +five degrees celsius~5 °C +seventy two degrees fahrenheit~72 °F +Seventy two Degrees Fahrenheit~72 °F +two hundred seventy three kelvin~273 K +Nine GigaBits per second~9 gbps \ No newline at end of file diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_money_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_money_cased.txt new file mode 100644 index 000000000..613a822a6 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_money_cased.txt @@ -0,0 +1,16 @@ +doce dólares y cinco centavos~$12,05 +Doce Dólares Y Cinco Céntimos~$12,05 +setenta y cinco Dólares sesenta y tres~$75,63 +Veintinueve dólares cincuenta centavos~$29,50 +Catorce millones quinientos mil Pesos mexicanos~Mex$14500000 +diez pesos Mexicanos~Mex$10 +Two dollars~$2 +One cent~$0.01 +Four united states dollars and sixty nine cents~$4.69 +Seventy five dollars sixty three~$75.63 +Twenty nine dollars fifty cents~$29.50 +Eleven dollars and fifty one cents~$11.51 +Nine hundred ninety three dollars and ninety two cents~$993.92 +Four hundred sixty billion won~₩460 billion +Thirty billion yen~¥30 billion +Two point five billion dollars~$2.5 billion diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_ordinal_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_ordinal_cased.txt new file mode 100644 index 000000000..7b7df1ed7 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_ordinal_cased.txt @@ -0,0 +1,21 @@ +primero~primero +Tercera~Tercera +Primer~Primer +tercer~tercer +Décima~10.ª +undécimo~11.º +Decimoprimer~11.ᵉʳ +Décimo primer~11.ᵉʳ +Décima Primera~11.ª +(technically ungrammatical) décimo primera~(technically ungrammatical) 11.ª +decimotercero~13.º +One hundredth~100th 
+Twenty five thousand one hundred eleventh~25111th +Second~2nd +Zeroth~0th +First~1st +Second~2nd +Third~3rd +Fourth~4th +Eleventh~11th +Twelfth~12th diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_telephone_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_telephone_cased.txt new file mode 100644 index 000000000..6b9613da0 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_telephone_cased.txt @@ -0,0 +1,19 @@ +Uno dos tres uno dos tres cinco seis siete ocho~123-123-5678 +uno veintitrés uno veintitrés cincuenta y seis setenta y ocho~123-123-5678 +Uno Dos Tres Cuatro Cinco Seis Siete Ocho Nueve~123-456-789 +Triple tres uno dos tres cinco seis siete ocho~333-123-5678 +Más uno uno dos tres uno dos tres cinco seis siete ocho~+1-123-123-5678 +más cincuenta y cuatro uno dos tres uno dos tres cinco seis siete ocho Extensión doce~+54-123-123-5678 ext. 12 +One two three one two three five six seven eight~123-123-5678 +Plus nine one one two three one two three five six seven eight~+91 123-123-5678 +Plus forty four one two three one two three five six seven eight~+44 123-123-5678 +O two three one two three five six seven eight~023-123-5678 +Oh two three one two three five six seven eight~023-123-5678 +Double oh three one two three five six seven eight~003-123-5678 +Two two five dot double five dot o dot four o~225.55.0.40 +Two two five dot double five dot o dot forty five~225.55.0.45 +SSN is seven double nine one two three double one three~SSN is 799-12-3113 +Seven nine nine~799 +A b nine~Ab9 +A b c~A b c +Five w k r a three one~5wkra31 diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time.txt index e12512e3a..17113ad98 100644 --- a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time.txt +++ 
b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time.txt @@ -21,23 +21,23 @@ las dos de la tarde~las 2:00 p.m. cuarto para las cero~las 23:45 cuarto para las veinticuatro~las 23:45 diez para las doce~las 11:50 -dos y media de la tarde~2:30 p.m. -la una de la tarde u t c más cuatro~la 1:00 p.m. utc+4 -eight oclock g m t~8:00 gmt -seven a m e s t~7:00 a.m. est -two p m~2:00 p.m. -two thirty~2:30 -three o'clock~3:00 -quarter past one~1:15 -half past three~3:30 -eight fifty one~8:51 -eight fifty two~8:52 -eight forty~8:40 -eight nineteen~8:19 -eight o six~8:06 -eight thirty eight~8:38 -eight thirty two~8:32 -eight twenty nine~8:29 +dos y media de la tarde~02:30 p.m. +la una de la tarde u t c más cuatro~la 1:00 p.m. UTC+4 +eight oclock g m t~08:00 gmt +seven a m e s t~07:00 a.m. est +two p m~02:00 p.m. +two thirty~02:30 +three o'clock~03:00 +quarter past one~01:15 +half past three~03:30 +eight fifty one~08:51 +eight fifty two~08:52 +eight forty~08:40 +eight nineteen~08:19 +eight o six~08:06 +eight thirty eight~08:38 +eight thirty two~08:32 +eight twenty nine~08:29 eleven fifty five p m~11:55 p.m. eleven fifty three p m~11:53 p.m. eleven forty a m~11:40 a.m. @@ -47,7 +47,7 @@ eleven forty six a m~11:46 a.m. eleven o six p m~11:06 p.m. eleven thirteen a m~11:13 a.m. half past twelve~12:30 -quarter past one~1:15 +quarter past one~01:15 quarter to one~12:45 quarter to twelve~11:45 set alarm at ten to eleven pm~set alarm at 10:50 p.m. diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time_cased.txt new file mode 100644 index 000000000..875ff97d6 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_time_cased.txt @@ -0,0 +1,19 @@ +las dieciséis cincuenta~las 16:50 +la una~la una +Las dos~Las dos +Las tres personas~Las tres personas +Las Dos a eme~Las 2:00 a.m. 
+la una Pe Eme~la 1:00 P.M. +la una y diez~la 1:10 +la una y Diez a eme~la 1:10 a.m. +La Una Y Diez pe eme~La 1:10 p.m. +Eight oclock g m t~08:00 gmt +Seven a m e s t~07:00 a.m. est +Two p m~02:00 p.m. +Seven A M E S T~07:00 A.M. EST +Two P M~02:00 P.M. +Two thirty~02:30 +Set alarm at ten to eleven pm~Set alarm at 10:50 p.m. +One min to one am~12:59 a.m. +eleven Forty six A M~11:46 A.M. +eleven forty six AM~11:46 A.M. \ No newline at end of file diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_whitelist.txt index 895fb52c2..90c024aa1 100644 --- a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_whitelist.txt +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_whitelist.txt @@ -1,8 +1,8 @@ -usted~ud. -ustedes~uds. -habla usted español~habla ud. español -hablan ustedes español~hablan uds. español -estados unidos~ee. uu. +usted~Ud. +ustedes~Uds. +habla usted español~habla Ud. español +hablan ustedes español~hablan Uds. español +estados unidos~EE. UU. doctor dao~dr. dao misses smith~mrs. smith mister dao~mr. dao diff --git a/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_word_cased.txt b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_word_cased.txt new file mode 100644 index 000000000..ebe0c0864 --- /dev/null +++ b/tests/nemo_text_processing/es_en/data_inverse_text_normalization/test_cases_word_cased.txt @@ -0,0 +1,17 @@ +~ +Yahoo!~Yahoo! +Veinte!~20 ! 
+X ~X +—~— +AAA~AAA +Aabach~Aabach +aabenraa~aabenraa +Aachen's~Aachen's +Aadri~Aadri +Aaliyan's~Aaliyan's +Aahar~Aahar +Aahh~Aahh +Aahperd~Aahperd +Aaibinterstate~Aaibinterstate +Aajab~Aajab +Aakasa~Aakasa \ No newline at end of file diff --git a/tests/nemo_text_processing/es_en/test_cardinal.py b/tests/nemo_text_processing/es_en/test_cardinal.py index d83c009d5..29fed2b30 100644 --- a/tests/nemo_text_processing/es_en/test_cardinal.py +++ b/tests/nemo_text_processing/es_en/test_cardinal.py @@ -21,8 +21,10 @@ class TestCardinal: - inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_cardinal.txt')) @pytest.mark.run_only_on('CPU') @@ -30,3 +32,10 @@ class TestCardinal: def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_cardinal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_date.py b/tests/nemo_text_processing/es_en/test_date.py index 0136b54d7..8368542d0 100644 --- a/tests/nemo_text_processing/es_en/test_date.py +++ b/tests/nemo_text_processing/es_en/test_date.py @@ -22,6 +22,9 @@ class TestDate: inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + )
@parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_date.txt')) @pytest.mark.run_only_on('CPU') @@ -29,3 +32,10 @@ class TestDate: def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_date_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_decimal.py b/tests/nemo_text_processing/es_en/test_decimal.py index d2fbae8c6..22a0e4d2c 100644 --- a/tests/nemo_text_processing/es_en/test_decimal.py +++ b/tests/nemo_text_processing/es_en/test_decimal.py @@ -22,6 +22,9 @@ class TestDecimal: inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_decimal.txt')) @pytest.mark.run_only_on('CPU') @@ -29,3 +32,10 @@ class TestDecimal: def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_decimal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_electronic.py b/tests/nemo_text_processing/es_en/test_electronic.py index 5dc91c639..90ab9e0d2 100644 --- 
a/tests/nemo_text_processing/es_en/test_electronic.py +++ b/tests/nemo_text_processing/es_en/test_electronic.py @@ -21,11 +21,21 @@ class TestElectronic: - inverse_normalizer_es = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_electronic.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm_es(self, test_input, expected): - pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_electronic_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_measure.py b/tests/nemo_text_processing/es_en/test_measure.py index 948f54db6..94fc46f1d 100644 --- a/tests/nemo_text_processing/es_en/test_measure.py +++ b/tests/nemo_text_processing/es_en/test_measure.py @@ -22,11 +22,21 @@ class TestMeasure: - inverse_normalizer_es = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_measure.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def 
test_denorm_es(self, test_input, expected): - pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_measure_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_money.py b/tests/nemo_text_processing/es_en/test_money.py index 6b2496015..8dadbe3d1 100644 --- a/tests/nemo_text_processing/es_en/test_money.py +++ b/tests/nemo_text_processing/es_en/test_money.py @@ -23,10 +23,20 @@ class TestMoney: inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_money.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit - def test_denorm(self, test_input, expected): + def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_money_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_ordinal.py b/tests/nemo_text_processing/es_en/test_ordinal.py index bef676a3c..bfb3dfc84 100644 --- a/tests/nemo_text_processing/es_en/test_ordinal.py +++ 
b/tests/nemo_text_processing/es_en/test_ordinal.py @@ -23,10 +23,20 @@ class TestOrdinal: inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_ordinal.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit - def test_denorm(self, test_input, expected): + def test_denorm_es(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_ordinal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_telephone.py b/tests/nemo_text_processing/es_en/test_telephone.py index ec8dba594..112bf0949 100644 --- a/tests/nemo_text_processing/es_en/test_telephone.py +++ b/tests/nemo_text_processing/es_en/test_telephone.py @@ -22,11 +22,21 @@ class TestTelephone: - inverse_normalizer_es = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_telephone.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm_es(self, test_input, expected): - pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) + pred = self.inverse_normalizer.inverse_normalize(test_input, 
verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_telephone_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_time.py b/tests/nemo_text_processing/es_en/test_time.py index 1d76e5012..f1fc03945 100644 --- a/tests/nemo_text_processing/es_en/test_time.py +++ b/tests/nemo_text_processing/es_en/test_time.py @@ -21,11 +21,21 @@ class TestTime: - inverse_normalizer_es = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_time.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm_es(self, test_input, expected): - pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_time_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) assert pred == expected diff --git a/tests/nemo_text_processing/es_en/test_word.py b/tests/nemo_text_processing/es_en/test_word.py index 273089b90..b7b67a17b 100644 --- a/tests/nemo_text_processing/es_en/test_word.py +++ b/tests/nemo_text_processing/es_en/test_word.py @@ -21,11 +21,21 @@ class TestWord: - inverse_normalizer_es = 
InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer = InverseNormalizer(lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='es_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) @parameterized.expand(parse_test_case_file('es_en/data_inverse_text_normalization/test_cases_word.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm_es(self, test_input, expected): - pred = self.inverse_normalizer_es.inverse_normalize(test_input, verbose=False) + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_word_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) assert pred == expected