diff --git a/Jenkinsfile b/Jenkinsfile index f9a225b27..6e392cf50 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-27-23-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-30-24-0' IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-26-23-0' HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' @@ -189,7 +189,7 @@ pipeline { } } - stage('L0: Create RU TN/ITN Grammars & SV & PT & ZH') { + stage('L0: Create RU TN/ITN Grammars & SV & PT') { when { anyOf { branch 'main' @@ -228,16 +228,6 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=pt --text="dez " --cache_dir ${PT_TN_CACHE}' } } - stage('L0: ZH TN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=zh --text="你" --cache_dir ${ZH_TN_CACHE}' - } - } - stage('L0: ZH ITN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=zh --text="二零零二年一月二十八日 " --cache_dir ${ZH_TN_CACHE}' - } - } } } @@ -267,9 +257,31 @@ pipeline { } } } + stage('L0: Create ZH TN/ITN Grammar') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: ZH ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=zh --text="你" --cache_dir ${ZH_TN_CACHE}' + } + } + stage('L0: ZH TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python 
nemo_text_processing/text_normalization/normalize.py --lang=zh --text="6" --cache_dir ${ZH_TN_CACHE}' + } + } + } + } // L1 Tests starts here + stage('L1: TN/ITN Tests CPU') { when { anyOf { diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index 13e8ab6d0..de1a7a28c 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -22,6 +22,8 @@ from pynini.export import export from pynini.lib import byte, pynutil, utf8 +from nemo_text_processing.inverse_text_normalization.zh.utils import load_labels + NEMO_CHAR = utf8.VALID_UTF8_CHAR NEMO_DIGIT = byte.DIGIT NEMO_HEX = pynini.union(*string.hexdigits).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/utils.py b/nemo_text_processing/inverse_text_normalization/zh/utils.py index d63a1b2f7..92336fe0f 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/utils.py @@ -60,3 +60,17 @@ def get_various_formats(text: str) -> List[str]: result.append(t.upper()) result.append(t.capitalize()) return result + + +def load_labels(abs_path): + """ + loads relative path file as dictionary + + Args: + abs_path: absolute path + + Returns dictionary of mappings + """ + with open(abs_path, encoding="utf-8") as label_tsv: + labels = list(csv.reader(label_tsv, delimiter="\t")) + return labels diff --git a/nemo_text_processing/text_normalization/zh/data/char/punctuations_zh.tsv b/nemo_text_processing/text_normalization/zh/data/char/punctuations_zh.tsv index 963b07d12..3848d54f9 100644 --- a/nemo_text_processing/text_normalization/zh/data/char/punctuations_zh.tsv +++ b/nemo_text_processing/text_normalization/zh/data/char/punctuations_zh.tsv @@ -70,3 +70,5 @@ … ‧ ﹏ +< +> \ No newline at end of file diff --git 
a/nemo_text_processing/text_normalization/zh/data/math/symbol.tsv b/nemo_text_processing/text_normalization/zh/data/math/symbol.tsv index 7f16f52a4..4eff86d48 100644 --- a/nemo_text_processing/text_normalization/zh/data/math/symbol.tsv +++ b/nemo_text_processing/text_normalization/zh/data/math/symbol.tsv @@ -5,3 +5,4 @@ × 乘 ÷ 除 ° 度 +- 减 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/zh/data/measure/units_en.tsv b/nemo_text_processing/text_normalization/zh/data/measure/units_en.tsv index 6d45b4a3b..b1a8a832a 100644 --- a/nemo_text_processing/text_normalization/zh/data/measure/units_en.tsv +++ b/nemo_text_processing/text_normalization/zh/data/measure/units_en.tsv @@ -1,7 +1,5 @@ amu 原子质量 bar 巴 -° 度 -º 度 °c 摄氏度 °C 摄氏度 ºc 摄氏度 @@ -40,23 +38,6 @@ kw 千瓦 kW 千瓦 lb 磅 lbs 磅 -m2 平方米 -m² 平方米 -m3 立方米 -m³ 立方米 -mbps 兆比特每秒 -mg 毫克 -mhz 兆赫兹 -mi2 平方英里 -mi² 平方英里 -mi 英里 -min 分钟哦 -ml 毫升 -mm2 平方毫米 -mm² 平方毫米 -mol 摩尔 -mpa 兆帕 -mph 英里每小时 ng 纳克 nm 纳米 ns 纳秒 @@ -80,13 +61,7 @@ gb 吉字节 gpa 吉帕斯卡 gy 戈瑞 ha 公顷 -m 米 -mm 毫米 -ms 毫秒 -mv 毫伏 -mw 毫瓦 pg 皮克 ps 皮秒 s 秒 -ms 毫秒 g 克 diff --git a/nemo_text_processing/text_normalization/zh/data/measure/units_zh.tsv b/nemo_text_processing/text_normalization/zh/data/measure/units_zh.tsv deleted file mode 100644 index 5ca1dd9ab..000000000 --- a/nemo_text_processing/text_normalization/zh/data/measure/units_zh.tsv +++ /dev/null @@ -1,211 +0,0 @@ -匹 -张 -座 -回 -场 -尾 -条 -个 -首 -阙 -阵 -网 -炮 -顶 -丘 -棵 -只 -支 -袭 -辆 -挑 -担 -颗 -壳 -窠 -曲 -墙 -群 -腔 -砣 -座 -客 -贯 -扎 -捆 -刀 -令 -打 -手 -罗 -坡 -山 -岭 -江 -溪 -钟 -队 -单 -双 -对 -口 -头 -脚 -板 -跳 -枝 -件 -贴 -针 -线 -管 -名 -位 -身 -堂 -课 -本 -页 -家 -户 -层 -丝 -毫 -厘 -分 -钱 -两 -斤 -担 -铢 -石 -钧 -锱 -忽 -克 -毫 -厘 -分 -寸 -尺 -丈 -里 -寻 -常 -铺 -程 -米 -撮 -勺 -合 -升 -斗 -石 -盘 -碗 -碟 -叠 -桶 -笼 -盆 -盒 -杯 -钟 -斛 -锅 -簋 -篮 -盘 -桶 -罐 -瓶 -壶 -卮 -盏 -箩 -箱 -煲 -啖 -袋 -钵 -年 -月 -日 -季 -刻 -时 -周 -天 -秒 -分 -旬 -纪 -岁 -世 -更 -夜 -春 -夏 -秋 -冬 -代 -伏 -辈 -丸 -泡 -粒 -颗 -幢 -堆 -条 -根 -支 -道 -面 -片 -张 -颗 -块 -架 -千米 -分米 -厘米 -毫米 -微米 -纳米 -亿 -千万 -百万 -万 -千 -百 -亿块 -千万块 -百万块 -万块 -千块 -百块 -亿角 -千万角 -百万角 -万角 -千角 -百角 
-亿毛 -千万毛 -百万毛 -万毛 -千毛 -百毛 -亿分 -千万分 -百万分 -万分 -千分 -百分 -亿元 -千万元 -百万元 -万元 -千元 -百元 diff --git a/nemo_text_processing/text_normalization/zh/data/money/currency_major.tsv b/nemo_text_processing/text_normalization/zh/data/money/currency_major.tsv index 88e6cc544..b80833507 100644 --- a/nemo_text_processing/text_normalization/zh/data/money/currency_major.tsv +++ b/nemo_text_processing/text_normalization/zh/data/money/currency_major.tsv @@ -168,7 +168,6 @@ Ft 匈牙利福林 ₪ 以色列谢克尔 J$ 牙买加元 лв 哈萨克斯坦腾格 -₩ 朝鲜园 лв 吉尔吉斯斯坦索姆 ₭ 老挝基普 ден 马其顿代纳尔 diff --git a/nemo_text_processing/text_normalization/zh/data/number/digit_alt.tsv b/nemo_text_processing/text_normalization/zh/data/number/digit_alt.tsv new file mode 100644 index 000000000..b949b9508 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/number/digit_alt.tsv @@ -0,0 +1,9 @@ +1 一 +2 两 +3 三 +4 四 +5 五 +6 六 +7 七 +8 八 +9 九 diff --git a/nemo_text_processing/text_normalization/zh/data/number/suffix.tsv b/nemo_text_processing/text_normalization/zh/data/number/suffix.tsv new file mode 100644 index 000000000..f44c0e151 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/number/suffix.tsv @@ -0,0 +1,23 @@ +万 +十万 +百万 +千万 +亿 +十亿 +百亿 +千亿 +萬 +十萬 +百萬 +千萬 +億 +十億 +百億 +千億 +拾萬 +佰萬 +仟萬 +拾億 +佰億 +仟億 + diff --git a/nemo_text_processing/text_normalization/zh/data/number/teen.tsv b/nemo_text_processing/text_normalization/zh/data/number/teen.tsv new file mode 100644 index 000000000..52dc01917 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/number/teen.tsv @@ -0,0 +1,10 @@ +10 十 +11 十一 +12 十二 +13 十三 +14 十四 +15 十五 +16 十六 +17 十七 +18 十八 +19 十九 diff --git a/nemo_text_processing/text_normalization/zh/data/number/teen_alt.tsv b/nemo_text_processing/text_normalization/zh/data/number/teen_alt.tsv new file mode 100644 index 000000000..a48662621 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/number/teen_alt.tsv @@ -0,0 +1,10 @@ +10 一十 +11 一十一 +12 一十二 +13 一十三 +14 一十四 +15 一十五 +16 一十六 +17 一十七 +18 一十八 +19 
一十九 diff --git a/nemo_text_processing/text_normalization/zh/data/number/ties.tsv b/nemo_text_processing/text_normalization/zh/data/number/ties.tsv new file mode 100644 index 000000000..2a73c0399 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/number/ties.tsv @@ -0,0 +1,8 @@ +2 二十 +3 三十 +4 四十 +5 五十 +6 六十 +7 七十 +8 八十 +9 九十 diff --git a/nemo_text_processing/text_normalization/zh/data/whitelist.tsv b/nemo_text_processing/text_normalization/zh/data/whitelist.tsv index 133143950..e8810f42a 100644 --- a/nemo_text_processing/text_normalization/zh/data/whitelist.tsv +++ b/nemo_text_processing/text_normalization/zh/data/whitelist.tsv @@ -77,12 +77,6 @@ C C t v CCTV kfc KFC K F C KFC Steam steam -phd 博士 -PhD 博士 -Dr. 医生 -Mr. 先生 -Mrs. 女士 -Ms. 小姐 O 2 O O to O O2O O to O P 2 P P to P @@ -161,4 +155,3 @@ cctv CCTV C C t v CCTV kfc KFC K F C KFC -Steam steam diff --git a/nemo_text_processing/text_normalization/zh/graph_utils.py b/nemo_text_processing/text_normalization/zh/graph_utils.py index 20e7532b6..f2ad527ae 100644 --- a/nemo_text_processing/text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/text_normalization/zh/graph_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -41,13 +41,18 @@ NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() - NEMO_SIGMA = pynini.closure(NEMO_CHAR) +NEMO_NOT_ALPHA = pynini.difference(NEMO_SIGMA, NEMO_ALPHA).optimize() +NEMO_SPACE_CHAR = pynini.union(NEMO_CHAR, NEMO_SPACE) delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) insert_space = pynutil.insert(" ") delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") + | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"")) +) def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]): diff --git a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py index 3756ba6c8..21437e82f 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,617 +16,177 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. - '23' -> cardinal { integer: "二十三" } - -10000 -> cardinal { negative: "负" integer: "一万" } - +10000 -> cardinal { positive: "正" integer: "一万" } + Finite state transducer for classifying cardinals + e.g. 
23 -> cardinal { integer: "二十三" } """ - def __init__(self, deterministic: bool = True, lm: bool = False): + def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="classify", deterministic=deterministic) + graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) + graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) + graph_digit_alt = pynini.string_file(get_abs_path("data/number/digit_alt.tsv")) + graph_ties = pynini.string_file(get_abs_path("data/number/ties.tsv")) + graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv")) + graph_teen_alt = pynini.string_file(get_abs_path("data/number/teen_alt.tsv")) + + alls = NEMO_DIGIT ** 2 | NEMO_DIGIT ** 1 + graph_all = ( + (graph_ties + (graph_digit | pynutil.delete('0'))) | graph_teen_alt | graph_digit + ) # graph_all when within a larger number e.g., 316-> 三百一十六 instead of 三百十六 + + graph_all = alls @ graph_all + graph_all_alt = ( + (graph_ties + (graph_digit | pynutil.delete('0'))) | graph_teen | graph_digit + ) # graph_all when at the head of the larger numbere.g., 13万 -> 十三万 instead of 一十三万 + graph_all_alt = alls @ graph_all_alt + + hundreds = NEMO_DIGIT ** 3 + graph_hundred_component = (graph_digit + pynutil.insert('百')) + pynini.union( + pynini.closure(pynutil.delete('0')), + (pynini.closure(pynutil.delete('0') + pynutil.insert('零')) + graph_all), + ) + graph_hundred = hundreds @ graph_hundred_component + + self.digit = graph_digit.optimize() + self.all = graph_all.optimize() + + thousands = NEMO_DIGIT ** 4 + graph_thousand_component = (graph_digit_alt + pynutil.insert('千')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_thousand = thousands @ graph_thousand_component + + ten_thousands = NEMO_DIGIT ** 5 + graph_ten_thousand_component = (graph_digit_alt + pynutil.insert('万')) + pynini.union( + 
pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_ten_thousand = ten_thousands @ graph_ten_thousand_component + + hundred_thousands = NEMO_DIGIT ** 6 + hundred_thousands_position = NEMO_DIGIT ** 2 + hundred_thousands_position = hundred_thousands_position @ graph_all_alt + graph_hundred_thousand_component = (hundred_thousands_position + pynutil.insert('万')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component + + millions = NEMO_DIGIT ** 7 + million_position = NEMO_DIGIT ** 3 + million_position = million_position @ graph_hundred_component + graph_million_component = (million_position + pynutil.insert('万')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_million = millions @ graph_million_component + + ten_millions = NEMO_DIGIT ** 8 + ten_million_position = NEMO_DIGIT ** 4 + ten_million_position = ten_million_position @ graph_thousand_component + graph_ten_million_component = (ten_million_position + pynutil.insert('万')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_ten_million = ten_millions @ graph_ten_million_component + + hundred_millions = NEMO_DIGIT ** 9 + graph_hundred_million_component = (graph_digit_alt + pynutil.insert('亿')) + pynini.union( + 
pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_million_component), + (pynutil.delete('00') + pynutil.insert('零') + graph_hundred_thousand_component), + (pynutil.delete('000') + pynutil.insert('零') + graph_ten_thousand_component), + (pynutil.delete('0000') + pynutil.insert('零') + graph_thousand_component), + (pynutil.delete('00000') + pynutil.insert('零') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_hundred_million = hundred_millions @ graph_hundred_million_component + + thousand_millions = NEMO_DIGIT ** 10 + thousand_millions_position = NEMO_DIGIT ** 2 + thousand_millions_position = thousand_millions_position @ graph_all_alt + graph_thousand_million_component = (thousand_millions_position + pynutil.insert('亿')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_million_component), + (pynutil.delete('00') + pynutil.insert('零') + graph_hundred_thousand_component), + (pynutil.delete('000') + pynutil.insert('零') + graph_ten_thousand_component), + (pynutil.delete('0000') + pynutil.insert('零') + graph_thousand_component), + ((pynutil.delete('00000') + pynutil.insert('零') + graph_hundred_component)), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_thousand_million = thousand_millions @ graph_thousand_million_component + + ten_billions = NEMO_DIGIT ** 11 + ten_billions_position = NEMO_DIGIT ** 3 + ten_billions_position = ten_billions_position @ graph_hundred_component + graph_ten_billions_component = (ten_billions_position + pynutil.insert('亿')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_million_component), + (pynutil.delete('00') + pynutil.insert('零') + graph_hundred_thousand_component), + (pynutil.delete('000') 
+ pynutil.insert('零') + graph_ten_thousand_component), + (pynutil.delete('0000') + pynutil.insert('零') + graph_thousand_component), + ((pynutil.delete('00000') + pynutil.insert('零') + graph_hundred_component)), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_ten_billions = ten_billions @ graph_ten_billions_component + + hundred_billions = NEMO_DIGIT ** 12 + hundred_billions_position = NEMO_DIGIT ** 4 + hundred_billions_position = hundred_billions_position @ graph_thousand_component + graph_hundred_billions_component = (hundred_billions_position + pynutil.insert('亿')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + pynutil.insert('零') + graph_million_component), + (pynutil.delete('00') + pynutil.insert('零') + graph_hundred_thousand_component), + (pynutil.delete('000') + pynutil.insert('零') + graph_ten_thousand_component), + (pynutil.delete('0000') + pynutil.insert('零') + graph_thousand_component), + ((pynutil.delete('00000') + pynutil.insert('零') + graph_hundred_component)), + (pynini.closure(pynutil.delete('0')) + pynutil.insert('零') + graph_all), + ) + graph_hundred_billions = hundred_billions @ graph_hundred_billions_component - # imports - zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) - digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) - digit_tens = pynini.string_file(get_abs_path("data/number/digit_tens.tsv")) - - # morphemes inserted + punctuation - tens_digit = pynutil.insert('十') - hundred_digit = pynutil.insert('百') - thousand_digit = pynutil.insert('千') - tenthousand_digit = pynutil.insert('万') - hundredmillion_digit = pynutil.insert('亿') - delete_punct = pynini.closure(pynutil.delete(',') | pynutil.delete(',')) - - # 十几; 10-19 - graph_teen = ( - pynini.closure(delete_punct) - + pynini.cross('1', '十') - + ( - (pynini.closure(delete_punct) + (pynini.closure(delete_punct) + digit)) - | (pynini.closure(delete_punct) + 
pynini.cross('0', '')) - ) - ) - - # 十几; 10-19 but when not alone, but within a larger number, (e.g, 119) - graph_teen_alt = ( - (pynini.closure(delete_punct) + (pynini.cross('1', '一十') + pynini.closure(delete_punct) + digit)) - | (pynini.closure(delete_punct) + pynini.cross('10', '一十')) - | (pynini.closure(delete_punct) + (pynini.cross('1,0', '一十') | pynini.cross('1,0', '一十'))) - ) # when the teen is not by itself but with in a larger number - - # 几十; 20-99 - graph_tens = ( - pynini.closure(delete_punct) - + (digit_tens + tens_digit + pynini.closure(delete_punct) + ((pynini.closure(delete_punct) + digit))) - ) | ( - digit_tens + tens_digit + (pynini.closure(delete_punct) + (pynini.cross('0', '') | pynini.cross(',0', ''))) - ) - - # 百; 100-999; hundreds - graph_hundred = ( - ( - digit - + ( - pynutil.delete('00') - | (pynutil.delete(',00') | pynutil.delete(',00')) - | (pynutil.delete('0,0') | pynutil.delete('0,0')) - ) - + hundred_digit - ) - | (digit + hundred_digit + (graph_tens | graph_teen_alt)) - | ( - digit - + hundred_digit - + ( - (pynini.cross(',0', '零') | pynini.cross(',0', '零')) - | pynini.cross('0', '零') - | (pynini.cross('0,', '零') | pynini.cross('0,', '零')) - ) - + digit - ) - ) - - # 千; 1000-9999; thousands - graph_thousand = ( - ( - digit - + ( - (pynutil.delete(',000') | pynutil.delete('000') | pynutil.delete('0,00') | pynutil.delete('00,0')) - | ( - pynutil.delete(',000') - | pynutil.delete('000') - | pynutil.delete('0,00') - | pynutil.delete('00,0') - ) - ) - + thousand_digit - ) - | (digit + pynini.closure(delete_punct) + thousand_digit + graph_hundred) - | ( - digit - + thousand_digit - + (pynini.cross('0', '零') | ((pynini.cross(',0', '零') | pynini.cross(',0', '零')))) - + (graph_tens | graph_teen_alt) - ) - | ( - digit - + pynini.closure(delete_punct) - + thousand_digit - + ( - pynini.cross('00', '零') - | (pynini.cross(',00', '零') | pynini.cross(',00', '零')) - | (pynini.cross('0,0', '零') | pynini.cross('0,0', '零')) - | (pynini.cross('00,', '零') 
| pynini.cross('00,', '零')) - ) - + digit - ) - ) - - # 万; 10000-99999; ten thousands - graph_tenthousand = ( - ( - digit - + (pynutil.delete('0000') | (pynutil.delete('0,000') | pynutil.delete('0,000'))) - + tenthousand_digit - ) - | (digit + tenthousand_digit + graph_thousand) - | ( - digit - + tenthousand_digit - + (pynini.cross('0', '零') | (pynini.cross('0,', '零') | pynini.cross('0,', '零'))) - + graph_hundred - ) - | ( - digit - + tenthousand_digit - + (pynini.cross('00', '零') | (pynini.cross('0,0', '零') | pynini.cross('0,0', '零'))) - + (graph_tens | graph_teen_alt) - ) - | ( - digit - + tenthousand_digit - + (pynini.cross('000', '零') | (pynini.cross('0,00', '零') | pynini.cross('0,00', '零'))) - + digit - ) - ) - - # 十万; 100000-999999; hundred thousands - graph_hundredthousand = ( - pynutil.add_weight( - ( - (graph_tens | graph_teen) - + tenthousand_digit - + (pynutil.delete('0000') | (pynutil.delete('0,000') | pynutil.delete('0,000'))) - ), - -0.1, - ) - | ((graph_tens | graph_teen) + tenthousand_digit + graph_thousand) - | ( - (graph_tens | graph_teen) - + tenthousand_digit - + (pynini.cross('0', '零') | (pynini.cross('0,', '零') | pynini.cross('0,', '零'))) - + graph_hundred - ) - | ( - (graph_tens | graph_teen) - + tenthousand_digit - + (pynini.cross('00', '零') | (pynini.cross('0,0', '零') | pynini.cross('0,0', '零'))) - + (graph_tens | graph_teen_alt) - ) - | ( - (graph_tens | graph_teen) - + tenthousand_digit - + (pynini.cross('000', '零') | (pynini.cross('0,00', '零') | pynini.cross('0,00', '零'))) - + digit - ) - ) - - # 百万; 1000000-9999999; millions - graph_million = ( - pynutil.add_weight( - ( - graph_hundred - + tenthousand_digit - + (pynutil.delete('0000') | (pynutil.delete('0,000') | pynutil.delete('0,000'))) - ), - -1.0, - ) - | (graph_hundred + tenthousand_digit + graph_thousand) - | ( - graph_hundred - + tenthousand_digit - + (pynini.cross('0', '零') | (pynini.cross('0,', '零') | pynini.cross('0,', '零'))) - + graph_hundred - ) - | ( - graph_hundred - + 
tenthousand_digit - + (pynini.cross('00', '零') | (pynini.cross('0,0', '零') | pynini.cross('0,0', '零'))) - + (graph_tens | graph_teen_alt) - ) - | ( - graph_hundred - + tenthousand_digit - + (pynini.cross('000', '零') | (pynini.cross('0,00', '零') | pynini.cross('0,00', '零'))) - + digit - ) - ) - - # 千万; 10000000-99999999; ten millions - graph_tenmillion = ( - pynutil.add_weight( - ( - graph_thousand - + (pynutil.delete('0000') | (pynutil.delete('0,000') | pynutil.delete('0,000'))) - + tenthousand_digit - ), - -1.0, - ) - | (graph_thousand + tenthousand_digit + graph_thousand) - | ( - graph_thousand - + tenthousand_digit - + (pynini.cross('0', '零') | (pynini.cross('0,', '零') | pynini.cross('0,', '零'))) - + graph_hundred - ) - | ( - graph_thousand - + tenthousand_digit - + (pynini.cross('00', '零') | (pynini.cross('0,0', '零') | pynini.cross('0,0', '零'))) - + (graph_tens | graph_teen_alt) - ) - | ( - graph_thousand - + tenthousand_digit - + (pynini.cross('000', '零') | (pynini.cross('0,00', '零') | pynini.cross('0,00', '零'))) - + digit - ) - ) - - # 亿; 100000000-999999999; hundred millions - graph_hundredmillion = ( - pynutil.add_weight( - ( - digit - + (pynutil.delete('00000000') | (pynutil.delete('00,000,000') | pynutil.delete('00,000,000'))) - + hundredmillion_digit - ), - -2.0, - ) - | pynutil.add_weight((digit + hundredmillion_digit + graph_tenmillion), -1.9) - | pynutil.add_weight((digit + hundredmillion_digit + pynutil.delete('0') + graph_million), -1.8) - | pynutil.add_weight( - (digit + hundredmillion_digit + pynutil.delete('00') + pynutil.insert('零') + graph_hundredthousand), - -1.7, - ) - | pynutil.add_weight( - ( - digit - + hundredmillion_digit - + (pynutil.delete('000') | (pynutil.delete('00,0') | pynutil.delete('00,0'))) - + pynutil.insert('零') - + graph_tenthousand - ), - -1.6, - ) - | pynutil.add_weight( - ( - digit - + hundredmillion_digit - + (pynutil.delete('0000') | (pynutil.delete('00,00') | pynutil.delete('00,00'))) - + pynutil.insert('零') - + 
graph_thousand - ), - -1.5, - ) - | pynutil.add_weight( - ( - digit - + hundredmillion_digit - + (pynutil.delete('00000') | (pynutil.delete('00,000,') | pynutil.delete('00,000,'))) - + pynutil.insert('零') - + graph_hundred - ), - -1.4, - ) - | pynutil.add_weight( - ( - digit - + hundredmillion_digit - + (pynutil.delete('000000') | (pynutil.delete('00,000,0') | pynutil.delete('00,000,0'))) - + pynutil.insert('零') - + (graph_tens | graph_teen_alt) - ), - -1.3, - ) - | pynutil.add_weight( - ( - digit - + hundredmillion_digit - + (pynutil.delete('0000000') | (pynutil.delete('00,000,00') | pynutil.delete('00,000,00'))) - + pynutil.insert('零') - + digit - ), - -1.2, - ) - ) - - # 十亿; 1000000000-9999999999; billions - graph_billion = ( - pynutil.add_weight( - ( - (graph_tens | graph_teen) - + (pynutil.delete('00000000') | (pynutil.delete('00,000,000') | pynutil.delete('00,000,000'))) - + hundredmillion_digit - ), - -2.0, - ) - | pynutil.add_weight(((graph_tens | graph_teen) + hundredmillion_digit + graph_tenmillion), -1.9) - | pynutil.add_weight( - ((graph_tens | graph_teen) + hundredmillion_digit + pynutil.delete('0') + graph_million), -1.8 - ) - | pynutil.add_weight( - ( - (graph_tens | graph_teen) - + hundredmillion_digit - + pynutil.delete('00') - + pynutil.insert('零') - + graph_hundredthousand - ), - -1.7, - ) - | pynutil.add_weight( - ( - (graph_tens | graph_teen) - + hundredmillion_digit - + (pynutil.delete('000') | (pynutil.delete('00,0') | pynutil.delete('00,0'))) - + pynutil.insert('零') - + graph_tenthousand - ), - -1.6, - ) - | pynutil.add_weight( - ( - (graph_tens | graph_teen) - + hundredmillion_digit - + (pynutil.delete('0000') | (pynutil.delete('00,00') | pynutil.delete('00,00'))) - + pynutil.insert('零') - + graph_thousand - ), - -1.5, - ) - | pynutil.add_weight( - ( - (graph_tens | graph_teen) - + hundredmillion_digit - + (pynutil.delete('00000') | (pynutil.delete('00,000,') | pynutil.delete('00,000,'))) - + pynutil.insert('零') - + graph_hundred - ), - 
-1.4, - ) - | pynutil.add_weight( - ( - (graph_tens | graph_teen) - + hundredmillion_digit - + (pynutil.delete('000000') | (pynutil.delete('00,000,0') | pynutil.delete('00,000,0'))) - + pynutil.insert('零') - + (graph_tens | graph_teen_alt) - ), - -1.3, - ) - | pynutil.add_weight( - ( - (graph_tens | graph_teen) - + hundredmillion_digit - + (pynutil.delete('0000000') | (pynutil.delete('00,000,00') | pynutil.delete('00,000,00'))) - + pynutil.insert('零') - + digit - ), - -1.2, - ) - ) - - # 百亿; 10000000000-99999999999; ten billions - graph_tenbillion = ( - pynutil.add_weight( - ( - graph_hundred - + (pynutil.delete('00000000') | (pynutil.delete('00,000,000') | pynutil.delete('00,000,000'))) - + hundredmillion_digit - ), - -2.0, - ) - | pynutil.add_weight((graph_hundred + hundredmillion_digit + graph_tenmillion), -1.9) - | pynutil.add_weight((graph_hundred + hundredmillion_digit + pynutil.delete('0') + graph_million), -1.8) - | pynutil.add_weight( - ( - graph_hundred - + hundredmillion_digit - + pynutil.delete('00') - + pynutil.insert('零') - + graph_hundredthousand - ), - -1.7, - ) - | pynutil.add_weight( - ( - graph_hundred - + hundredmillion_digit - + (pynutil.delete('000') | (pynutil.delete('00,0') | pynutil.delete('00,0'))) - + pynutil.insert('零') - + graph_tenthousand - ), - -1.6, - ) - | pynutil.add_weight( - ( - graph_hundred - + hundredmillion_digit - + (pynutil.delete('0000') | (pynutil.delete('00,00') | pynutil.delete('00,00'))) - + pynutil.insert('零') - + graph_thousand - ), - -1.5, - ) - | pynutil.add_weight( - ( - graph_hundred - + hundredmillion_digit - + (pynutil.delete('00000') | (pynutil.delete('00,000,') | pynutil.delete('00,000,'))) - + pynutil.insert('零') - + graph_hundred - ), - -1.4, - ) - | pynutil.add_weight( - ( - graph_hundred - + hundredmillion_digit - + (pynutil.delete('000000') | (pynutil.delete('00,000,0') | pynutil.delete('00,000,0'))) - + pynutil.insert('零') - + (graph_tens | graph_teen_alt) - ), - -1.3, - ) - | pynutil.add_weight( - ( - 
graph_hundred - + hundredmillion_digit - + (pynutil.delete('0000000') | (pynutil.delete('00,000,00') | pynutil.delete('00,000,00'))) - + pynutil.insert('零') - + digit - ), - -1.2, - ) - ) - - # 千亿; 100000000000-999999999999; hundred billions - graph_hundredbillion = ( - pynutil.add_weight( - ( - graph_thousand - + hundredmillion_digit - + (pynutil.delete('00000000') | (pynutil.delete('00,000,000') | pynutil.delete('00,000,000'))) - ), - -2.0, - ) - | pynutil.add_weight((graph_thousand + hundredmillion_digit + graph_tenmillion), -1.9) - | pynutil.add_weight((graph_thousand + hundredmillion_digit + pynutil.delete('0') + graph_million), -1.8) - | pynutil.add_weight( - ( - graph_thousand - + hundredmillion_digit - + pynutil.delete('00') - + pynutil.insert('零') - + graph_hundredthousand - ), - -1.7, - ) - | pynutil.add_weight( - ( - graph_thousand - + hundredmillion_digit - + (pynutil.delete('000') | (pynutil.delete('00,0') | pynutil.delete('00,0'))) - + pynutil.insert('零') - + graph_tenthousand - ), - -1.6, - ) - | pynutil.add_weight( - ( - graph_thousand - + hundredmillion_digit - + (pynutil.delete('0000') | (pynutil.delete('00,00') | pynutil.delete('00,00'))) - + pynutil.insert('零') - + graph_thousand - ), - -1.5, - ) - | pynutil.add_weight( - ( - graph_thousand - + hundredmillion_digit - + (pynutil.delete('00000') | (pynutil.delete('00,000,') | pynutil.delete('00,000,'))) - + pynutil.insert('零') - + graph_hundred - ), - -1.4, - ) - | pynutil.add_weight( - ( - graph_thousand - + hundredmillion_digit - + (pynutil.delete('000000') | (pynutil.delete('00,000,0') | pynutil.delete('00,000,0'))) - + pynutil.insert('零') - + (graph_tens | graph_teen_alt) - ), - -1.3, - ) - | pynutil.add_weight( - ( - graph_thousand - + hundredmillion_digit - + (pynutil.delete('0000000') | (pynutil.delete('00,000,00') | pynutil.delete('00,000,00'))) - + pynutil.insert('零') - + digit - ), - -1.2, - ) - ) - - suffix = pynini.union( - "万", - "十万", - "百万", - "千万", - "亿", - "十亿", - "百亿", - "千亿", - 
"萬", - "十萬", - "百萬", - "千萬", - "億", - "十億", - "百億", - "千億", - "拾萬", - "佰萬", - "仟萬", - "拾億", - "佰億", - "仟億", - "拾万", - "佰万", - "仟万", - "仟亿", - "佰亿", - "仟亿", - "万亿", - "萬億", - ) - graph_mandarin = pynini.closure( - ( - ( - digit - | graph_teen - | graph_tens - | graph_hundred - | graph_thousand - | graph_tenthousand - | graph_hundredthousand - ) - + suffix - ) - ) - - # combining all the graph above graph = pynini.union( - pynutil.add_weight(graph_hundredbillion, -2.0), - pynutil.add_weight(graph_tenbillion, -1.9), - pynutil.add_weight(graph_billion, -1.8), - pynutil.add_weight(graph_hundredmillion, -1.7), - pynutil.add_weight(graph_tenmillion, -1.6), - pynutil.add_weight(graph_million, -1.5), - pynutil.add_weight(graph_hundredthousand, -1.4), - pynutil.add_weight(graph_tenthousand, -1.3), - pynutil.add_weight(graph_thousand, -1.2), - pynutil.add_weight(graph_hundred, -1.1), - pynutil.add_weight(graph_tens, -1.0), - graph_teen, - digit, - zero, - ) - - # adding optional +(正)/-(负) signs - graph_sign = ( - (pynutil.insert("positive: \"") + pynini.accep("正") + pynutil.insert("\"")) - | (pynutil.insert("negative: \"") + pynini.accep("负") + pynutil.insert("\"")) - | (pynutil.insert("negative: \"") + pynini.cross("負", "负") + pynutil.insert("\"")) - | (pynutil.insert("negative: \"") + pynini.cross("-", "负") + pynutil.insert("\"")) - | (pynutil.insert("positive: \"") + pynini.cross("+", "正") + pynutil.insert("\"")) - ) - - graph_mandarin_sign = graph_sign + pynutil.insert(" ") + graph_mandarin - # final graph - final_graph_sign = ( - graph_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") - ) - final_graph_numbers_only = pynutil.insert("integer: \"") + graph + pynutil.insert("\"") - # imprted when building other grammars - self.just_cardinals = graph | graph_mandarin | final_graph_sign | graph_mandarin_sign - graph_mandarins = pynutil.insert("integer: \"") + graph_mandarin + pynutil.insert("\"") - - final_graph = 
final_graph_numbers_only | final_graph_sign | graph_mandarins | graph_mandarin_sign + graph_hundred_billions, + graph_ten_billions, + graph_thousand_million, + graph_hundred_million, + graph_ten_million, + graph_million, + graph_hundred_thousand, + graph_ten_thousand, + graph_thousand, + graph_hundred, + graph_all_alt, + graph_zero, + ) + self.just_cardinals = graph.optimize() + optional_sign = ( + pynutil.insert("negative: \"") + (pynini.accep("-") | pynini.cross("负", "-")) + pynutil.insert("\"") + ) + final_graph = ( + optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") + ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) + + self.with_sign = final_graph.optimize() final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/date.py b/nemo_text_processing/text_normalization/zh/taggers/date.py index 92fbfce4d..607b63511 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/date.py +++ b/nemo_text_processing/text_normalization/zh/taggers/date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -58,12 +58,14 @@ def __init__(self, deterministic: bool = True, lm: bool = False): ) only_month = pynutil.insert("month: \"") + month + pynutil.delete('月') + pynutil.insert("\"") only_day = pynutil.insert("day: \"") + day + delete_day + pynutil.insert("\"") + # gh_1 graph_only_date = only_year | only_month | only_day year_month = only_year + pynutil.insert(' ') + only_month month_day = only_month + pynutil.insert(' ') + only_day - graph_all = only_year + pynutil.insert(' ') + only_month + pynutil.insert(' ') + only_day - graph_combination = year_month | month_day | graph_all + graph_ymd = only_year + pynutil.insert(' ') + only_month + pynutil.insert(' ') + only_day + # gh_2 + graph_combination = year_month | month_day | graph_ymd year_component = ( pynutil.insert("year: \"") @@ -73,8 +75,9 @@ def __init__(self, deterministic: bool = True, lm: bool = False): ) month_component = pynutil.insert("month: \"") + month + delete_sign + pynutil.insert("\"") day_component = pynutil.insert("day: \"") + day + pynutil.insert("\"") + # gp_3 graph_sign = year_component + pynutil.insert(' ') + month_component + pynutil.insert(' ') + day_component - + # gp_1+2+3 graph_all = graph_only_date | graph_sign | graph_combination prefix = ( @@ -86,11 +89,13 @@ def __init__(self, deterministic: bool = True, lm: bool = False): | pynini.accep('纪元前') ) prefix_component = pynutil.insert("era: \"") + prefix + pynutil.insert("\"") - graph_prefix = prefix_component + pynutil.insert(' ') + (pynutil.add_weight(graph_all, -2.0)) + # gp_prefix+(1,2,3) + graph_prefix = prefix_component + pynutil.insert(' ') + (graph_ymd | year_month | only_year) suffix_component = pynutil.insert("era: \"") + suffix + pynutil.insert("\"") - graph_suffix = (pynutil.add_weight(graph_all, -2.0)) + pynutil.insert(' ') + suffix_component - + # gp_suffix +(1,2,3) + graph_suffix = (graph_ymd | year_month | only_year) + pynutil.insert(' ') + suffix_component + # gp_4 graph_affix = graph_prefix | graph_suffix 
graph_suffix_year = ( diff --git a/nemo_text_processing/text_normalization/zh/taggers/decimal.py b/nemo_text_processing/text_normalization/zh/taggers/decimal.py index 8228777c7..d4afb3fd9 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -73,19 +73,25 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa super().__init__(name="decimal", kind="classify", deterministic=deterministic) cardinal_before_decimal = cardinal.just_cardinals - cardinal_after_decimal = pynini.string_file(get_abs_path("data/number/digit.tsv")) | pynini.closure( - pynini.cross('0', '零') - ) + cardinal_after_decimal = pynini.string_file(get_abs_path("data/number/digit.tsv")) + zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) + + graph_integer = pynutil.insert('integer_part: \"') + cardinal_before_decimal + pynutil.insert("\"") - decimal_point = pynini.closure(pynutil.delete('.'), 0, 1) - graph_integer = pynutil.insert("integer_part: \"") + cardinal_before_decimal + pynutil.insert("\"") graph_fraction = ( - pynutil.insert("fractional_part: \"") + pynini.closure(cardinal_after_decimal, 1) + pynutil.insert("\"") + pynutil.insert("fractional_part: \"") + + pynini.closure((pynini.closure(cardinal_after_decimal, 1) | (pynini.closure(zero, 1))), 1) + + pynutil.insert("\"") ) - graph_decimal = graph_integer + decimal_point + pynutil.insert(" ") + graph_fraction + graph_decimal = graph_integer + pynutil.delete('.') + pynutil.insert(" ") + graph_fraction + self.regular_decimal = graph_decimal.optimize() graph_sign = ( - (pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"负\"")) 
+ pynutil.insert(" ")) + ( + pynini.closure(pynutil.insert("negative: \"") + pynini.cross("-", "负")) + + pynutil.insert("\"") + + pynutil.insert(" ") + ) ) | ( ( pynutil.insert('negative: ') @@ -98,14 +104,12 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = Fa graph_with_sign = graph_sign + graph_decimal graph_regular = graph_with_sign | graph_decimal - # graph_decimal_quantity = get_quantity(graph_decimal, cardinal.just_cardinals) graph_decimal_quantity = get_quantity(graph_decimal) graph_sign_quantity = graph_sign + graph_decimal_quantity graph_quantity = graph_decimal_quantity | graph_sign_quantity - # final_graph = graph_decimal | graph_sign | graph_decimal_quantity | graph_sign_quantity final_graph = graph_regular | graph_quantity - self.decimal = final_graph + self.decimal = final_graph.optimize() final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/fraction.py b/nemo_text_processing/text_normalization/zh/taggers/fraction.py index 6d68280b5..3f9ce42c7 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/zh/taggers/fraction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst +from nemo_text_processing.text_normalization.zh.utils import get_abs_path class FractionFst(GraphFst): @@ -31,14 +32,15 @@ class FractionFst(GraphFst): 98% -> tokens { fraction { denominator: "百" numerator: "九十八"} } Args: - cardinal: CardinalFst, decimal: DecimalFst + cardinal: CardinalFst, decimal: DecimalFst """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True, lm: bool = False): + def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = False): super().__init__(name="fraction", kind="classify", deterministic=deterministic) graph_cardinals = cardinal.just_cardinals - graph_decimal = decimal.decimal + graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) + graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) slash = pynutil.delete('/') morpheme = pynutil.delete('分之') @@ -75,7 +77,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = "仟亿", ) - integer_component = pynutil.insert("integer_part: \"") + graph_cardinals + pynutil.insert("\"") + integer_component = pynutil.insert('integer_part: \"') + graph_cardinals + pynutil.insert("\"") denominator_component = pynutil.insert("denominator: \"") + graph_cardinals + pynutil.insert("\"") numerator_component = pynutil.insert("numerator: \"") + graph_cardinals + pynutil.insert("\"") @@ -86,7 +88,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = + slash + pynutil.insert(' ') + denominator_component - ) + ) # 5又1/3 + graph_only_slash = numerator_component + slash + pynutil.insert(' ') + denominator_component graph_morpheme = (denominator_component + morpheme + pynutil.insert(' ') + numerator_component) | ( @@ -97,65 +100,68 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = + morpheme + pynutil.insert(' ') + 
numerator_component - ) + ) # 5又3分之1 graph_with_suffix = ( pynini.closure(pynutil.insert("denominator: \"") + suffix + pynutil.insert("\""), 0, 1) + morpheme + pynutil.insert(' ') + numerator_component - ) + ) # 万分之1 percentage = pynutil.delete('%') - graph_percentage = ( - numerator_component - + percentage - + pynutil.insert(' ') - + pynutil.insert("denominator: \"百") + + graph_decimal = ( + pynutil.insert('integer_part: \"') + + pynini.closure( + graph_cardinals + + pynutil.delete('.') + + pynutil.insert('点') + + pynini.closure((graph_digit | graph_zero), 1) + ) + pynutil.insert("\"") ) + graph_decimal_percentage = pynini.closure( + graph_decimal + percentage + pynutil.insert(' denominator: \"百"'), 1 + ) # 5.6% - graph_hundred = pynutil.delete('100%') + pynutil.insert('numerator: \"百\" denominator: \"百"') + graph_integer_percentage = pynini.closure( + (numerator_component) + percentage + pynutil.insert(' denominator: \"百"'), 1 + ) # 5% - graph_optional_sign = ( - (pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"负\""))) - | (pynini.closure(pynutil.insert("positive: ") + pynini.cross("+", "\"正\""))) - | (pynutil.insert("positive: ") + pynutil.insert("\"") + pynini.accep('正') + pynutil.insert("\"")) - | ( - pynutil.insert('negative: ') - + pynutil.insert("\"") - + (pynini.accep('负') | pynini.cross('負', '负')) - + pynutil.insert("\"") - ) - ) + graph_hundred = pynutil.delete('100%') + pynutil.insert('numerator: \"百\" denominator: \"百"') + # 100% - graph_decimals = ( - graph_decimal - + pynutil.insert(" ") - + percentage - + pynutil.insert("denominator: \"百") + graph_optional_sign = (pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"负\""))) | ( + pynutil.insert('negative: ') + + pynutil.insert("\"") + + (pynini.accep('负') | pynini.cross('負', '负')) + pynutil.insert("\"") ) - graph = ( - graph_with_integer - | graph_only_slash - | graph_morpheme - | graph_with_suffix - | graph_percentage - | graph_decimals - | 
pynutil.add_weight(graph_hundred, -3.0) + graph = pynini.union( + graph_with_integer, + graph_only_slash, + graph_morpheme, + graph_with_suffix, + graph_decimal_percentage, + graph_integer_percentage, + graph_hundred, ) graph_with_sign = ( (graph_optional_sign + pynutil.insert(" ") + graph_with_integer) | (graph_optional_sign + pynutil.insert(" ") + graph_only_slash) | (graph_optional_sign + pynutil.insert(" ") + graph_morpheme) | (graph_optional_sign + pynutil.insert(" ") + graph_with_suffix) - | (graph_optional_sign + pynutil.insert(" ") + graph_percentage) - | pynutil.add_weight((graph_optional_sign + pynutil.insert(" ") + graph_hundred), -3.0) + | (graph_optional_sign + pynutil.insert(" ") + graph_integer_percentage) + | (graph_optional_sign + pynutil.insert(" ") + graph_decimal_percentage) + | (graph_optional_sign + pynutil.insert(" ") + graph_hundred) ) - final_graph = graph | graph_with_sign + final_graph = graph | pynutil.add_weight(graph_with_sign, -3.0) + + self.just_fractions = graph.optimize() + self.fractions = final_graph.optimize() final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/math_symbol.py b/nemo_text_processing/text_normalization/zh/taggers/math_symbol.py deleted file mode 100644 index d6ae0be9c..000000000 --- a/nemo_text_processing/text_normalization/zh/taggers/math_symbol.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import pynini -from pynini.lib import pynutil - -from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst -from nemo_text_processing.text_normalization.zh.taggers.cardinal import CardinalFst -from nemo_text_processing.text_normalization.zh.utils import get_abs_path - - -class MathSymbol(GraphFst): - ''' - + -> tokens { sign: "加" } - ''' - - def __init__(self, deterministic: bool = True, lm: bool = False): - super().__init__(name="sign", kind="classify", deterministic=deterministic) - ''' - add your sign in data/math/symbol.tsv,this graph just convert sigh to character,you can add more - cases with detailed cases - ''' - score_sign = pynini.string_file(get_abs_path("data/math/score.tsv")) | pynini.string_file( - get_abs_path("data/math/symbol.tsv") - ) - score = ( - pynutil.insert("score: \"") - + pynini.closure(score_sign, 0, 1) - + CardinalFst().just_cardinals - + score_sign - + CardinalFst().just_cardinals - + pynutil.insert("\"") - ) - graph = score - self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/measure.py b/nemo_text_processing/text_normalization/zh/taggers/measure.py index 3fa61cffe..d7da8f524 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/measure.py +++ b/nemo_text_processing/text_normalization/zh/taggers/measure.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ + import pynini from pynini.lib import pynutil @@ -18,33 +20,57 @@ from nemo_text_processing.text_normalization.zh.utils import get_abs_path -class Measure(GraphFst): +class MeasureFst(GraphFst): ''' 1kg -> tokens { measure { cardinal { integer: "一" } units: "千克" } } ''' - def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True, lm: bool = False): + def __init__( + self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True, lm: bool = False + ): super().__init__(name="measure", kind="classify", deterministic=deterministic) units_en = pynini.string_file(get_abs_path("data/measure/units_en.tsv")) - units_zh = pynini.string_file(get_abs_path("data/measure/units_zh.tsv")) - graph_cardinal = cardinal.just_cardinals - integer_component = pynutil.insert("integer: \"") + graph_cardinal + pynutil.insert("\"") - unit_component = pynutil.insert("units: \"") + (units_en | units_zh) + pynutil.insert("\"") - graph_cardinal_measure = integer_component + insert_space + unit_component + graph_cardinal = cardinal.with_sign + graph_decimal = decimal.decimal + + # these units ared added due to falures when running Sparrow Hawk tests that "ms" would be processed as "m" and "s" left outside of the tagegr + units = ( + pynini.cross("ms", "毫秒") + | pynini.cross("m²", "平方米") + | pynini.cross("m2", "平方米") + | pynini.cross("m²", "平方米") + | pynini.cross("m³", "立方米") + | pynini.cross("mbps", "兆比特每秒") + | pynini.cross("mg", "毫克") + | pynini.cross("mhz", "兆赫兹") + | pynini.cross("mi2", "平方英里") + | pynini.cross("mi²", "平方英里") + | pynini.cross("mi", "英里") + | pynini.cross("min", "分钟") + | pynini.cross("ml", "毫升") + | pynini.cross("mm2", "平方毫米") + | pynini.cross("mm²", "平方毫米") + | pynini.cross("mol", "摩尔") + | pynini.cross("mpa", "兆帕") + | pynini.cross("mph", "英里每小时") + | pynini.cross("mm", "毫米") + | pynini.cross("mv", "毫伏") + | pynini.cross("mw", "毫瓦") + ) + + unit_component = pynutil.insert("units: \"") + (units_en | units) + 
pynutil.insert("\"") - decimal = decimal.decimal - graph_decimal = ( - decimal + insert_space + pynutil.insert("units: \"") + (units_en | units_zh) + pynutil.insert("\"") + graph_cardinal_measure = pynini.closure( + (pynutil.insert("cardinal { ") + graph_cardinal + pynutil.insert(" } ") + insert_space + unit_component), 1 ) - graph_sign = ( - (pynutil.insert("negative: \"") + pynini.accep("负") + pynutil.insert("\"")) - | (pynutil.insert("negative: \"") + pynini.cross("負", "负") + pynutil.insert("\"")) - | (pynutil.insert("negative: \"") + pynini.cross("-", "负") + pynutil.insert("\"")) + graph_decimal_measure = pynini.closure( + (pynutil.insert("decimal { ") + graph_decimal + pynutil.insert(" } ") + unit_component), 1 ) - graph = pynini.closure(graph_sign + insert_space) + (graph_cardinal_measure | graph_decimal) + graph_measures = graph_decimal_measure | graph_cardinal_measure - self.fst = self.add_tokens(graph).optimize() + final_graph = self.add_tokens(graph_measures) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/money.py b/nemo_text_processing/text_normalization/zh/taggers/money.py index 93fa59e61..786319627 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/text_normalization/zh/taggers/money.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -19,6 +19,40 @@ from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.text_normalization.zh.utils import get_abs_path +# def get_quantity(decimal): +suffix = pynini.union( + "万", + "十万", + "百万", + "千万", + "亿", + "十亿", + "百亿", + "千亿", + "萬", + "十萬", + "百萬", + "千萬", + "億", + "十億", + "百億", + "千億", + "拾萬", + "佰萬", + "仟萬", + "拾億", + "佰億", + "仟億", + "拾万", + "佰万", + "仟万", + "仟亿", + "佰亿", + "仟亿", + "万亿", + "萬億", +) + class MoneyFst(GraphFst): """ @@ -27,18 +61,19 @@ class MoneyFst(GraphFst): '23美元' -> money { integer: "二十三" currency: "美元" } """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True, lm: bool = False): + def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = False): super().__init__(name="money", kind="classify", deterministic=deterministic) cardinal = cardinal.just_cardinals - decimal = decimal.decimal currency = pynini.string_file(get_abs_path("data/money/currency_major.tsv")) currency_mandarin = pynini.string_file(get_abs_path("data/money/currency_mandarin.tsv")) + graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) + graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) # regular money gramamr with currency symbols $1000 currency_component = pynutil.insert("currency: \"") + currency + pynutil.insert("\"") - number_component = pynutil.insert("integer: \"") + cardinal + pynutil.insert("\"") + number_component = pynutil.insert("integer_part: \"") + (cardinal | (cardinal + suffix)) + pynutil.insert("\"") graph_regular_money = currency_component + pynutil.insert(" ") + number_component # 块 元 毛 with optional symbols @@ -54,8 +89,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = currency_mandarin_component = pynutil.insert("currency: \"") + currency_mandarin + pynutil.insert("\"") unit_components = ( (pynutil.insert("currency: \"") + unit_major + pynutil.insert("\"")) - | 
(pynutil.insert("currency_major: \"") + unit_minor + pynutil.insert("\"")) - | (pynutil.insert("currency_minor: \"") + unit_minor_alt + pynutil.insert("\"")) + | (pynutil.insert("currency_maj: \"") + unit_minor + pynutil.insert("\"")) + | (pynutil.insert("currency_min: \"") + unit_minor_alt + pynutil.insert("\"")) ) graph_unit_only = ( @@ -70,12 +105,33 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = graph_mandarin_money = number_component + pynutil.insert(" ") + currency_mandarin_component # larger money as decimals - graph_decimal_money = (decimal + pynutil.insert(" ") + currency_mandarin_component) | ( - currency_component + pynutil.insert(" ") + decimal + graph_decimal = ( + pynutil.insert('integer_part: \"') + + pynini.closure( + pynini.closure(cardinal, 1) + + pynutil.delete('.') + + pynutil.insert('点') + + pynini.closure((graph_digit | graph_zero), 1) + ) + + pynutil.insert("\"") + ) + graph_decimal_money = ( + pynini.closure(graph_decimal, 1) + + pynini.closure(pynutil.insert(' quantity: \"') + suffix + pynutil.insert('\"')) + + pynutil.insert(" ") + + pynini.closure(currency_mandarin_component, 1) + ) | ( + pynini.closure(currency_component, 1) + + pynutil.insert(" ") + + pynini.closure(graph_decimal, 1) + + pynini.closure(pynutil.insert(" ") + pynutil.insert('quantity: \"') + suffix + pynutil.insert('\"')) ) graph = ( - graph_regular_money | graph_units | pynutil.add_weight(graph_mandarin_money, -3.0) | graph_decimal_money + graph_regular_money + | graph_units + | pynutil.add_weight(graph_mandarin_money, -3.0) + | pynutil.add_weight(graph_decimal_money, -1.0) ) final_graph = graph diff --git a/nemo_text_processing/text_normalization/zh/taggers/ordinal.py b/nemo_text_processing/text_normalization/zh/taggers/ordinal.py index 258a9068c..e09dd8047 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/ordinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 
2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,8 +28,8 @@ class OrdinalFst(GraphFst): cardinal: CardinalFst """ - def __init__(self, cardinal: GraphFst): - super().__init__(name="ordinal", kind="verbalize") + def __init__(self, cardinal: GraphFst, deterministic: bool = True, lm: bool = False): + super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic) graph_cardinal = cardinal.just_cardinals morpheme = pynini.accep('第') diff --git a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py index df612fd8d..82e1c174f 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py +++ b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + + import pynini from pynini.lib import pynutil @@ -18,7 +20,7 @@ from nemo_text_processing.text_normalization.zh.utils import get_abs_path -class PreProcessor(GraphFst): +class PreProcessorFst(GraphFst): ''' Preprocessing of TN: 1. 
interjections removal such as '啊, 呃' diff --git a/nemo_text_processing/text_normalization/zh/taggers/punctuation.py b/nemo_text_processing/text_normalization/zh/taggers/punctuation.py index cff124834..d6920c75d 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/zh/taggers/punctuation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + import sys from unicodedata import category @@ -36,7 +37,7 @@ class PunctuationFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="punctuation", kind="classify", deterministic=deterministic) - s = "!#%&\'()*+,-./:;<=>?@^_`{|}~\"。,;-《》“”" + s = "!#%&\'()*+,-/:;<=>?@^_`{|}~\"。,;-《》“”" punct_symbols_to_exclude = ["[", "]"] punct_unicode = [ diff --git a/nemo_text_processing/text_normalization/zh/taggers/time.py b/nemo_text_processing/text_normalization/zh/taggers/time.py index 283b8c47b..b0248d5c3 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/time.py +++ b/nemo_text_processing/text_normalization/zh/taggers/time.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -41,9 +41,9 @@ def __init__(self, deterministic: bool = True, lm: bool = False): # gramamr for time, separated by colons 05:03:13 symbol = pynutil.delete(":") | pynutil.delete(":") - hour_component = pynutil.insert("hour: \"") + hour + pynutil.insert('点') + pynutil.insert("\"") - minute_component = pynutil.insert("minute: \"") + minute + pynutil.insert('分') + pynutil.insert("\"") - second_component = pynutil.insert("second: \"") + second + pynutil.insert('秒') + pynutil.insert("\"") + hour_component = pynutil.insert("hours: \"") + hour + pynutil.insert('点') + pynutil.insert("\"") + minute_component = pynutil.insert("minutes: \"") + minute + pynutil.insert('分') + pynutil.insert("\"") + second_component = pynutil.insert("seconds: \"") + second + pynutil.insert('秒') + pynutil.insert("\"") # combining 3 components hour_minute_second = ( hour_component @@ -75,12 +75,12 @@ def __init__(self, deterministic: bool = True, lm: bool = False): minute_duration = pynini.accep("分钟") | pynini.accep('刻') | pynini.accep('刻钟') second_duration = pynini.accep("秒钟") | pynini.cross('秒鐘', '秒钟') | pynini.accep('秒') # combining two above - hour_component = pynutil.insert("hour: \"") + hour + (hour_clock | hour_duration) + pynutil.insert("\"") + hour_component = pynutil.insert("hours: \"") + hour + (hour_clock | hour_duration) + pynutil.insert("\"") minute_component = ( - pynutil.insert("minute: \"") + minute + (minute_clock | minute_duration) + pynutil.insert("\"") + pynutil.insert("minutes: \"") + minute + (minute_clock | minute_duration) + pynutil.insert("\"") ) second_component = ( - pynutil.insert("second: \"") + second + (second_clock | second_duration) + pynutil.insert("\"") + pynutil.insert("seconds: \"") + second + (second_clock | second_duration) + pynutil.insert("\"") ) hour_minute = hour_component + pynutil.insert(' ') + minute_component hour_second = hour_component + pynutil.insert(' ') + second_component @@ -97,7 +97,7 @@ def __init__(self, deterministic: bool = True, lm: 
bool = False): ) # gramamr for time, back count; 五点差n分n秒 - backcount = pynutil.insert("verb: \"") + pynini.accep('差') + pynutil.insert("\"") + backcount = pynutil.insert("morphosyntactic_features: \"") + pynini.accep('差') + pynutil.insert("\"") graph_hour = ( ( pynini.closure(backcount) diff --git a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py index 822f3d00f..d35ea178b 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/zh/taggers/tokenize_and_classify.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,41 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ import os import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.zh.graph_utils import ( - NEMO_CHAR, - NEMO_DIGIT, - GraphFst, - delete_extra_space, - delete_space, - generator_main, -) +from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst, generator_main from nemo_text_processing.text_normalization.zh.taggers.cardinal import CardinalFst from nemo_text_processing.text_normalization.zh.taggers.date import DateFst from nemo_text_processing.text_normalization.zh.taggers.decimal import DecimalFst from nemo_text_processing.text_normalization.zh.taggers.fraction import FractionFst -from nemo_text_processing.text_normalization.zh.taggers.math_symbol import MathSymbol -from nemo_text_processing.text_normalization.zh.taggers.measure import Measure +from nemo_text_processing.text_normalization.zh.taggers.measure import MeasureFst from nemo_text_processing.text_normalization.zh.taggers.money import MoneyFst from nemo_text_processing.text_normalization.zh.taggers.ordinal import OrdinalFst -from nemo_text_processing.text_normalization.zh.taggers.preprocessor import PreProcessor from nemo_text_processing.text_normalization.zh.taggers.punctuation import PunctuationFst from nemo_text_processing.text_normalization.zh.taggers.time import TimeFst from nemo_text_processing.text_normalization.zh.taggers.whitelist import WhiteListFst -from nemo_text_processing.text_normalization.zh.taggers.word import Char -from nemo_text_processing.utils.logging import logger +from nemo_text_processing.text_normalization.zh.taggers.word import WordFst class ClassifyFst(GraphFst): """ - Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased. - For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. 
+ For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. deterministic: if True will provide a single transduction option, @@ -59,7 +50,7 @@ class ClassifyFst(GraphFst): def __init__( self, input_case: str, - deterministic: bool = False, + deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False, whitelist: str = None, @@ -70,84 +61,40 @@ def __init__( if cache_dir is not None and cache_dir != "None": os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" - far_file = os.path.join( - # cache_dir, f"_{input_case}_zh_tn_{deterministic}_deterministic{whitelist_file}.far" - cache_dir, - f"_{input_case}_zh_tn_{deterministic}_deterministic_{whitelist_file}.far", - ) + far_file = os.path.join(cache_dir, f"zh_tn_{deterministic}_deterministic_{whitelist_file}_tokenize.far") if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] - no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT)) - self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize() - logger.info(f"ClassifyFst.fst was restored from {far_file}.") else: - logger.info(f"Creating ClassifyFst grammars. 
This might take some time...") - - cardinal = CardinalFst() - cardinal_graph = cardinal.fst - - ordinal = OrdinalFst(cardinal=cardinal) - ordinal_graph = ordinal.fst - - decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic) - decimal_graph = decimal.fst - - fraction = FractionFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic) - fraction_graph = fraction.fst - + cardinal = CardinalFst(deterministic=deterministic) date = DateFst(deterministic=deterministic) - date_graph = date.fst - - word_graph = Char(deterministic=deterministic).fst - - self.time = TimeFst(deterministic=deterministic) - time_graph = self.time.fst - - money = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic) - money_graph = money.fst - - self.math = MathSymbol(deterministic=deterministic) - math_graph = self.math.fst - - self.measure = Measure(cardinal=cardinal, decimal=decimal, deterministic=deterministic) - measure_graph = self.measure.fst - - self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) - whitelist_graph = self.whitelist.fst - punct_graph = PunctuationFst(deterministic=deterministic).fst - - classify = ( - pynutil.add_weight(whitelist_graph, 1.001) - | pynutil.add_weight(cardinal_graph, -2.0) - | pynutil.add_weight(time_graph, 1.1) - | pynutil.add_weight(fraction_graph, -1.1) - | pynutil.add_weight(date_graph, -1.0) - | pynutil.add_weight(ordinal_graph, 1.1) - | pynutil.add_weight(decimal_graph, -1.0) - | pynutil.add_weight(money_graph, -1.1) - | pynutil.add_weight(math_graph, 1.1) - | pynutil.add_weight(measure_graph, -1.1) - | pynutil.add_weight(word_graph, 1.1) - ) - - classify |= pynutil.add_weight(word_graph, 100) - - punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") - token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") - token_plus_punct = ( - pynini.closure(punct + pynutil.insert(" ")) + token + 
pynini.closure(pynutil.insert(" ") + punct) + decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic) + time = TimeFst(deterministic=deterministic) + fraction = FractionFst(cardinal=cardinal, deterministic=deterministic) + money = MoneyFst(cardinal=cardinal, deterministic=deterministic) + measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic) + ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic) + whitelist = WhiteListFst(deterministic=deterministic) + word = WordFst(deterministic=deterministic) + punctuation = PunctuationFst(deterministic=deterministic) + + classify = pynini.union( + pynutil.add_weight(date.fst, 1.1), + pynutil.add_weight(fraction.fst, 1.0), + pynutil.add_weight(money.fst, 1.1), + pynutil.add_weight(measure.fst, 1.05), + pynutil.add_weight(time.fst, 1.1), + pynutil.add_weight(whitelist.fst, 1.1), + pynutil.add_weight(cardinal.fst, 1.1), + pynutil.add_weight(decimal.fst, 3.05), + pynutil.add_weight(ordinal.fst, 1.1), + pynutil.add_weight(punctuation.fst, 1.0), + pynutil.add_weight(word.fst, 100), ) - graph = token_plus_punct + pynini.closure(pynutil.add_weight(delete_extra_space, 1.1) + token_plus_punct) - graph = delete_space + graph + delete_space - - # self.fst = graph.optimize() - tagger = graph.optimize() - preprocessor = PreProcessor(remove_interjections=True, fullwidth_to_halfwidth=True,) - self.fst = preprocessor.fst @ tagger + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") + tagger = pynini.closure(token, 1) - no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT)) - self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize() + self.fst = tagger if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) diff --git a/nemo_text_processing/text_normalization/zh/taggers/whitelist.py b/nemo_text_processing/text_normalization/zh/taggers/whitelist.py index 5b6196102..9015bd047 100644 --- 
a/nemo_text_processing/text_normalization/zh/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/zh/taggers/whitelist.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -33,33 +33,25 @@ class WhiteListFst(GraphFst): input_file: path to a file with whitelist replacements """ - def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None): + def __init__(self, deterministic: bool = True, input_file: str = None): super().__init__(name="whitelist", kind="classify", deterministic=deterministic) - def _get_whitelist_graph(input_case, file): + def _get_whitelist_graph(file): whitelist = load_labels(file) - if input_case == "lower_cased": - whitelist = [[x[0].lower()] + x[1:] for x in whitelist] graph = pynini.string_map(whitelist) return graph - graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist.tsv")) - if not deterministic and input_case != "lower_cased": - graph |= pynutil.add_weight( - _get_whitelist_graph("lower_cased", get_abs_path("data/whitelist.tsv")), weight=0.0001 - ) + graph = _get_whitelist_graph(get_abs_path("data/whitelist.tsv")) + + graph |= pynutil.add_weight(_get_whitelist_graph(get_abs_path("data/whitelist.tsv")), weight=0.0001) if input_file: - whitelist_provided = _get_whitelist_graph(input_case, input_file) + whitelist_provided = _get_whitelist_graph(input_file) if not deterministic: graph |= whitelist_provided else: graph = whitelist_provided - if not deterministic: - units_graph = _get_whitelist_graph(input_case, file=get_abs_path("data/measure/measurements.tsv")) - graph |= units_graph - self.graph = graph self.final_graph = convert_space(self.graph).optimize() self.fst = (pynutil.insert("name: \"") + self.final_graph + 
pynutil.insert("\"")).optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/word.py b/nemo_text_processing/text_normalization/zh/taggers/word.py index 776e4afdc..4e3b42b00 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/word.py +++ b/nemo_text_processing/text_normalization/zh/taggers/word.py @@ -1,30 +1,34 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pynini -from pynini.lib import pynutil - -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_SPACE, GraphFst - - -class Char(GraphFst): - ''' - 你 -> char { name: "你" } - ''' - - def __init__(self, deterministic: bool = True, lm: bool = False): - super().__init__(name="char", kind="classify", deterministic=deterministic) - - graph = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") - self.fst = graph.optimize() +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst + + +class WordFst(GraphFst): + """ + Finite state transducer for classifying word. + e.g. dormir -> tokens { name: "dormir" } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="word", kind="classify") + word = pynutil.insert("name: \"") + NEMO_NOT_QUOTE + pynutil.insert("\"") + self.fst = word.optimize() diff --git a/nemo_text_processing/text_normalization/zh/utils.py b/nemo_text_processing/text_normalization/zh/utils.py index d2748380e..4d08f1deb 100644 --- a/nemo_text_processing/text_normalization/zh/utils.py +++ b/nemo_text_processing/text_normalization/zh/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/zh/verbalizers/cardinal.py index 0cd9c3193..1a28241af 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -30,7 +30,7 @@ class CardinalFst(GraphFst): def __init__(self, deterministic: bool = True, lm: bool = False): super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic) - delete_sign = pynini.cross("negative: \"负\"", "负") | pynini.cross("positive: \"正\"", "正") + delete_sign = pynini.cross("negative: \"-\"", "负") delete_integer = ( pynutil.delete("integer: ") + pynutil.delete("\"") @@ -44,6 +44,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False): ) graph_sign = delete_sign + delete_space + delete_integer final_graph = delete_integer | graph_sign | graph_mandarin + self.numbers = final_graph delete_tokens = self.delete_tokens(final_graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/date.py b/nemo_text_processing/text_normalization/zh/verbalizers/date.py index 86405bcff..f69f4a797 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ import pynini from pynini.lib import pynutil @@ -55,6 +56,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False): optional_era = ( pynutil.delete("era: ") + pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") ) + graph_date = ( pynini.closure(year_component) + pynini.closure(delete_space) @@ -62,7 +64,20 @@ def __init__(self, deterministic: bool = True, lm: bool = False): + pynini.closure(delete_space) + pynini.closure(day_component) ) - graph_date_era = optional_era + delete_space + graph_date + + graph_date_era = pynini.union( + (optional_era + delete_space + year_component), + (optional_era + delete_space + year_component + delete_space + month_component), + ( + optional_era + + delete_space + + year_component + + delete_space + + month_component + + delete_space + + day_component + ), + ) graph_date_all = graph_date | graph_date_era @@ -84,6 +99,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False): ) final_graph = graph_date_all | graph_range + # final_graph = optional_era + delete_space + year_component delete_tokens = self.delete_tokens(final_graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/text_normalization/zh/verbalizers/decimal.py index 05fb2045e..795ab01a6 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -30,21 +30,9 @@ class DecimalFst(GraphFst): def __init__(self, deterministic: bool = True, lm: bool = False): super().__init__(name="decimal", kind="verbalize", deterministic=deterministic) - integer = ( - pynutil.delete("integer_part:") - + delete_space - + pynutil.delete("\"") - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") - ) + integer = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - fractional = ( - pynutil.delete("fractional_part:") - + delete_space - + pynutil.delete("\"") - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") - ) + fractional = pynutil.delete("fractional_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") quantity = ( pynutil.delete("quantity:") @@ -63,6 +51,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False): ) graph = integer + delete_space + pynutil.insert("点") + fractional + self.decimal_regular = graph graph_quantity = graph + delete_space + quantity graph_regular = graph | graph_quantity diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/fraction.py b/nemo_text_processing/text_normalization/zh/verbalizers/fraction.py index 8207c1a22..c2a719c16 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/fraction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -72,10 +72,19 @@ def __init__(self, decimal: GraphFst, deterministic: bool = True, lm: bool = Fal ) graph_no_integer = denominator_part + delete_space + pynutil.insert('分之') + numerator_part graph = graph_with_integer | graph_no_integer - graph_with_sign = sign_part + delete_space + graph - graph_with_decimal = denominator_part + delete_space + pynutil.insert('分之') + graph_decimal + + graph_with_decimal = ( + denominator_part + + delete_space + + pynutil.insert('分之') + + pynutil.delete("integer_part: \"") + + pynini.closure(NEMO_NOT_QUOTE) + + pynutil.delete("\"") + ) + graph_with_sign = sign_part + delete_space + (graph | graph_with_decimal) final_graph = graph_with_sign | graph | graph_with_decimal + self.fraction = final_graph delete_tokens = self.delete_tokens(final_graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/math_symbol.py b/nemo_text_processing/text_normalization/zh/verbalizers/math_symbol.py deleted file mode 100644 index 59ef1c31a..000000000 --- a/nemo_text_processing/text_normalization/zh/verbalizers/math_symbol.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import pynini -from pynini.lib import pynutil - -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst - - -class MathSymbol(GraphFst): - ''' - tokens { sign: "加" } -> 加 - ''' - - def __init__(self, deterministic: bool = True, lm: bool = False): - super().__init__(name="sign", kind="verbalize", deterministic=deterministic) - - graph = pynutil.delete('score: \"') + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete('\"') - - self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/measure.py b/nemo_text_processing/text_normalization/zh/verbalizers/measure.py index ff4d0df07..00ba3b8ed 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/measure.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,41 +11,61 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ + import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space -class Measure(GraphFst): +class MeasureFst(GraphFst): ''' tokens { measure { cardinal: "一" } units: "千克" } } -> 一千克 ''' - def __init__(self, deterministic: bool = True, lm: bool = False): + def __init__( + self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True, lm: bool = False + ): super().__init__(name="measure", kind="verbalize", deterministic=deterministic) + cardinal = cardinal.numbers + decimal = decimal.decimal_component sign_component = pynutil.delete("negative: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") - integer_component = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") unit_component = pynutil.delete("units: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") - cardinal_graph = integer_component + delete_space + unit_component + graph_cardinal = ( + pynutil.delete("cardinal { ") + cardinal + pynutil.delete(" } ") + delete_space + unit_component + ) - decimal_graph = ( - pynutil.delete("integer_part: \"") - + pynini.closure(NEMO_NOT_QUOTE) - + pynutil.delete("\"") - + pynutil.insert("点") + graph_decimal = ( + pynutil.delete("decimal {") + + pynini.closure(pynutil.delete(NEMO_SPACE)) + + decimal + + pynini.closure(pynutil.delete(NEMO_SPACE)) + + pynutil.delete("}") + + pynini.closure(pynutil.delete(NEMO_SPACE)) + delete_space - + pynutil.delete("fractional_part: \"") - + pynini.closure(NEMO_NOT_QUOTE, 0) - + pynutil.delete("\"") + + unit_component + ) + + graph_fraction = ( + pynutil.delete("fraction {") + + pynini.closure(pynutil.delete(NEMO_SPACE)) + + fraction.fraction + + pynini.closure(pynutil.delete(NEMO_SPACE)) + + pynutil.delete("}") + + 
pynini.closure(pynutil.delete(NEMO_SPACE)) + delete_space - + pynutil.delete("units: \"") - + pynini.closure(NEMO_NOT_QUOTE) - + pynutil.delete("\"") + + unit_component ) - graph = pynini.closure(sign_component + delete_space) + (cardinal_graph | decimal_graph) + graph_math_cardinal = pynutil.delete("cardinal { ") + cardinal + pynutil.delete(" } ") + + graph_measures = graph_decimal | graph_cardinal | graph_fraction + graph_maths = graph_math_cardinal + + final_graph = graph_maths | graph_measures - self.fst = self.delete_tokens(graph).optimize() + delete_tokens = self.delete_tokens(final_graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/money.py b/nemo_text_processing/text_normalization/zh/verbalizers/money.py index 9e121bbc6..74f517d01 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/money.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -31,13 +31,13 @@ def __init__(self, decimal: GraphFst, deterministic: bool = True, lm: bool = Fal super().__init__(name="money", kind="verbalize", deterministic=deterministic) # components to combine to make graphs - number_component = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + number_component = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") currency_component = pynutil.delete("currency: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") decimal_component = decimal.decimal_component unit_only_component = ( (pynutil.delete("currency: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")) - | (pynutil.delete("currency_major: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")) - | (pynutil.delete("currency_minor: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")) + | (pynutil.delete("currency_maj: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")) + | (pynutil.delete("currency_min: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")) ) # graphs diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/zh/verbalizers/ordinal.py index 0379c06fe..d019355e2 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/ordinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -25,8 +25,8 @@ class OrdinalFst(GraphFst): tokens { ordinal { integer: "第一千万" } } -> 第一千万 """ - def __init__(self): - super().__init__(name="ordinal", kind="verbalize") + def __init__(self, deterministic: bool = True, lm: bool = False): + super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic) symbol = pynini.union("-", "~", "——", "—") dash = pynini.cross(symbol, "到") diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py new file mode 100644 index 000000000..4bafef0bd --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/verbalizers/post_processing.py @@ -0,0 +1,113 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os + +import pynini + +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_NOT_SPACE, + NEMO_SIGMA, + delete_space, + generator_main, +) +from nemo_text_processing.utils.logging import logger + + +class PostProcessingFst: + """ + Finite state transducer that post-processing an entire sentence after verbalization is complete, e.g. + removes extra spaces around punctuation marks " ( one hundred and twenty three ) " -> "(one hundred and twenty three)" + + Args: + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. 
+ overwrite_cache: set to True to overwrite .far files + """ + + def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, "zh_tn_post_processing.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["post_process_graph"] + logger.info(f'Post processing graph was restored from {far_file}.') + else: + self.set_punct_dict() + self.fst = self.get_punct_postprocess_graph() + + if far_file: + generator_main(far_file, {"post_process_graph": self.fst}) + + def set_punct_dict(self): + self.punct_marks = { + "'": [ + "'", + '´', + 'ʹ', + 'ʻ', + 'ʼ', + 'ʽ', + 'ʾ', + 'ˈ', + 'ˊ', + 'ˋ', + '˴', + 'ʹ', + '΄', + '՚', + '՝', + 'י', + '׳', + 'ߴ', + 'ߵ', + 'ᑊ', + 'ᛌ', + '᾽', + '᾿', + '`', + '´', + '῾', + '‘', + '’', + '‛', + '′', + '‵', + 'ꞌ', + ''', + '`', + '𖽑', + '𖽒', + ], + } + + def get_punct_postprocess_graph(self): + """ + Returns graph to post process punctuation marks. + + {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. + By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. 
+ """ + + remove_space_around_single_quote = pynini.cdrewrite( + delete_space, NEMO_NOT_SPACE, NEMO_NOT_SPACE, pynini.closure(NEMO_SIGMA) + ) + # this works if spaces in between (good) + # delete space between 2 NEMO_NOT_SPACE(left and right to the space) that are with in a content of NEMO_SIGMA + + graph = remove_space_around_single_quote.optimize() + + return graph diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py index 36394843c..a63769787 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + + import pynini from pynini.lib import pynutil, utf8 diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/time.py b/nemo_text_processing/text_normalization/zh/verbalizers/time.py index aa3baf046..11105a916 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/time.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -38,9 +38,9 @@ def __init__(self, deterministic: bool = True): alphabet_pm = pynini.string_file(get_abs_path("data/time/PM.tsv")) # fundamental components - hour_component = pynutil.delete("hour: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") - minute_component = pynutil.delete("minute: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") - second_component = pynutil.delete("second: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + hour_component = pynutil.delete("hours: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + minute_component = pynutil.delete("minutes: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + second_component = pynutil.delete("seconds: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") graph_regular = ( hour_component | minute_component @@ -52,7 +52,7 @@ def __init__(self, deterministic: bool = True): ) # back count 三点差五分 - delete_verb = pynutil.delete("verb: \"") + pynini.accep("差") + pynutil.delete("\"") + delete_verb = pynutil.delete("morphosyntactic_features: \"") + pynini.accep("差") + pynutil.delete("\"") graph_back_count = ( ( pynini.closure(delete_verb + pynutil.insert(' ')) diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py index da4d64ca0..221fbcbc7 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,26 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. + + import pynini -from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst +from nemo_text_processing.text_normalization.zh.graph_utils import GraphFst, delete_space from nemo_text_processing.text_normalization.zh.verbalizers.cardinal import CardinalFst from nemo_text_processing.text_normalization.zh.verbalizers.date import DateFst from nemo_text_processing.text_normalization.zh.verbalizers.decimal import DecimalFst from nemo_text_processing.text_normalization.zh.verbalizers.fraction import FractionFst -from nemo_text_processing.text_normalization.zh.verbalizers.math_symbol import MathSymbol -from nemo_text_processing.text_normalization.zh.verbalizers.measure import Measure +from nemo_text_processing.text_normalization.zh.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.zh.verbalizers.money import MoneyFst from nemo_text_processing.text_normalization.zh.verbalizers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.zh.verbalizers.time import TimeFst from nemo_text_processing.text_normalization.zh.verbalizers.whitelist import Whitelist -from nemo_text_processing.text_normalization.zh.verbalizers.word import Char +from nemo_text_processing.text_normalization.zh.verbalizers.word import WordFst class VerbalizeFst(GraphFst): """ Composes other verbalizer grammars. - For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. 
Args: deterministic: if True will provide a single transduction option, @@ -42,28 +43,27 @@ def __init__(self, deterministic: bool = True): date = DateFst(deterministic=deterministic) cardinal = CardinalFst(deterministic=deterministic) - char = Char(deterministic=deterministic) + ordinal = OrdinalFst(deterministic=deterministic) decimal = DecimalFst(deterministic=deterministic) + word = WordFst(deterministic=deterministic) fraction = FractionFst(decimal=decimal, deterministic=deterministic) - math_symbol = MathSymbol(deterministic=deterministic) money = MoneyFst(decimal=decimal, deterministic=deterministic) - measure = Measure(deterministic=deterministic) - ordinal = OrdinalFst() + measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic) time = TimeFst(deterministic=deterministic) whitelist = Whitelist(deterministic=deterministic) graph = pynini.union( date.fst, cardinal.fst, + ordinal.fst, decimal.fst, fraction.fst, - char.fst, - math_symbol.fst, + word.fst, money.fst, measure.fst, - ordinal.fst, time.fst, whitelist.fst, ) + graph = pynini.closure(delete_space) + graph + pynini.closure(delete_space) self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py index e4b0927d0..b16625530 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,7 +20,7 @@ from nemo_text_processing.text_normalization.zh.verbalizers.postprocessor import PostProcessor from nemo_text_processing.text_normalization.zh.verbalizers.verbalize import VerbalizeFst -# from nemo_text_processing.utils.logging import logger +# from nemo.utils import logging class VerbalizeFinalFst(GraphFst): @@ -38,6 +38,7 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ self.fst = pynini.Far(far_file, mode="r")["verbalize"] else: token_graph = VerbalizeFst(deterministic=deterministic) + token_verbalizer = ( pynutil.delete("tokens {") + delete_space + token_graph.fst + delete_space + pynutil.delete(" }") ) @@ -46,5 +47,3 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ postprocessor = PostProcessor(remove_puncts=False, to_upper=False, to_lower=False, tag_oov=False,) self.fst = (verbalizer @ postprocessor.fst).optimize() - if far_file: - generator_main(far_file, {"verbalize": self.fst}) diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py index 3be84e0a0..662cf9f28 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/whitelist.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ + import pynini from pynini.lib import pynutil diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/word.py b/nemo_text_processing/text_normalization/zh/verbalizers/word.py index bdcafef96..f30f254c5 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/word.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,12 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + + from pynini.lib import pynutil from nemo_text_processing.text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst -class Char(GraphFst): +class WordFst(GraphFst): ''' tokens { char: "你" } -> 你 ''' diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_cardinal.txt index 11f3f8cc1..573042f7a 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_cardinal.txt @@ -1,161 +1,85 @@ 10000~一万 -负10000~负一万 -正100000~正十万 +负1000~负一千 +100000~十万 210~二百一十 -负200~负二百 -负219~负二百一十九 +200~二百 +219~二百一十九 5000~五千 5100~五千一百 5110~五千一百一十 5111~五千一百一十一 5010~五千零一十 -5099~五千零九十九 +5010~五千零一十 5001~五千零一 -5,000~五千 -5,100~五千一百 -5,010~五千零一十 -5,001~五千零一 -50,000~五万 -51,000~五万一千 -50,100~五万零一百 -50,010~五万零一十 -51,100~五万一千一百 -51,110~五万一千一百一十 -50,011~五万零一十一 -50,001~五万零一 -50,010~五万零一十 50000~五万 51000~五万一千 50100~五万零一百 50010~五万零一十 -50001~五万零一 51100~五万一千一百 51110~五万一千一百一十 -51111~五万一千一百一十一 -50111~五万零一百一十一 50011~五万零一十一 -500,000~五十万 -510,000~五十一万 
-501,000~五十万一千 -500,100~五十万零一百 -500,010~五十万零一十 -500,111~五十万零一百一十一 -501,100~五十万一千一百 -501,111~五十万一千一百一十一 -511,111~五十一万一千一百一十一 -100,000~十万 -110,000~十一万 +50001~五万零一 +50010~五万零一十 +50000~五万 +51000~五万一千 +50100~五万零一百 +50010~五万零一十 +50001~五万零一 +51100~五万一千一百 +50011~五万零一十一 +500000~五十万 +510000~五十一万 +501000~五十万一千 +500100~五十万零一百 +100000~十万 +110000~十一万 500000~五十万 510000~五十一万 501000~五十万一千 500100~五十万零一百 500010~五十万零一十 -500001~五十万零一 -500111~五十万零一百一十一 -501100~五十万一千一百 501111~五十万一千一百一十一 511111~五十一万一千一百一十一 100000~十万 110000~十一万 1100000~一百一十万 -1010000~一百零一万 -1001000~一百万一千 -1000100~一百万零一百 -1000010~一百万零一十 1000001~一百万零一 1000000~一百万 -1,000,000~一百万 -1,100,000~一百一十万 -1,010,000~一百零一万 -1,001,000~一百万一千 -1,000,100~一百万零一百 -1,000,010~一百万零一十 -1,000,001~一百万零一 10000000~一千万 11000000~一千一百万 10100000~一千零一十万 10010000~一千零一万 +11000000~一千一百万 +10100000~一千零一十万 +10010000~一千零一万 10001000~一千万一千 -10010100~一千零一万零一百 -10010010~一千零一万零一十 -10010001~一千零一万零一 -10,000,000~一千万 -11,000,000~一千一百万 -10,100,000~一千零一十万 -10,010,000~一千零一万 -10,001,000~一千万一千 -10,010,100~一千零一万零一百 -10,010,010~一千零一万零一十 -10,010,001~一千零一万零一 -101111111~一亿一百一十一万一千一百一十一 +101111111~一亿零一百一十一万一千一百一十一 110111111~一亿一千零一十一万一千一百一十一 111011111~一亿一千一百零一万一千一百一十一 111101111~一亿一千一百一十万一千一百一十一 111110111~一亿一千一百一十一万零一百一十一 -111111011~一亿一千一百一十一万一千零一十一 -111111101~一亿一千一百一十一万一千一百零一 -111111111~一亿一千一百一十一万一千一百一十一 -101,111,111~一亿一百一十一万一千一百一十一 -110,111,111~一亿一千零一十一万一千一百一十一 -111,011,111~一亿一千一百零一万一千一百一十一 -111,101,111~一亿一千一百一十万一千一百一十一 -111,110,111~一亿一千一百一十一万零一百一十一 -111,111,011~一亿一千一百一十一万一千零一十一 -111,111,101~一亿一千一百一十一万一千一百零一 -111,111,110~一亿一千一百一十一万一千一百一十 -111,111,111~一亿一千一百一十一万一千一百一十一 +101111111~一亿零一百一十一万一千一百一十一 +110111111~一亿一千零一十一万一千一百一十一 +111011111~一亿一千一百零一万一千一百一十一 +111101111~一亿一千一百一十万一千一百一十一 1011111111~十亿一千一百一十一万一千一百一十一 -1101111111~十一亿一百一十一万一千一百一十一 -1110111111~十一亿一千零一十一万一千一百一十一 +1101111111~十一亿零一百一十一万一千一百一十一 1111011111~十一亿一千一百零一万一千一百一十一 -1111110111~十一亿一千一百一十一万零一百一十一 -1111111101~十一亿一千一百一十一万一千一百零一 -1111111111~十一亿一千一百一十一万一千一百一十一 -1,011,111,111~十亿一千一百一十一万一千一百一十一 -1,101,111,111~十一亿一百一十一万一千一百一十一 
-1,110,111,111~十一亿一千零一十一万一千一百一十一 -1,111,011,111~十一亿一千一百零一万一千一百一十一 -1,111,101,111~十一亿一千一百一十万一千一百一十一 -1,111,110,111~十一亿一千一百一十一万零一百一十一 -1,111,111,011~十一亿一千一百一十一万一千零一十一 -1,111,111,101~十一亿一千一百一十一万一千一百零一 -1,111,111,110~十一亿一千一百一十一万一千一百一十 +11000000000~一百一十亿 +10000100000~一百亿零十万 11000000000~一百一十亿 10100000000~一百零一亿 10010000000~一百亿一千万 -10001000000~一百亿一百万 -10000100000~一百亿零十万 -11,000,000,000~一百一十亿 -10,100,000,000~一百零一亿 -10,010,000,000~一百亿一千万 -10,001,000,000~一百亿一百万 -10,000,100,000~一百亿零十万 -10,000,010,000~一百亿零一万 -10,000,001,000~一百亿零一千 -10,000,000,100~一百亿零一百 -10,000,000,010~一百亿零一十 -10,000,000,001~一百亿零一 +10001000000~一百亿零一百万 +10000000001~一百亿零一 100000100000~一千亿零十万 100000010000~一千亿零一万 100000001000~一千亿零一千 -100000000100~一千亿零一百 -100000000010~一千亿零一十 -100000000001~一千亿零一 -100,000,000,000~一千亿 -110,000,000,000~一千一百亿 -101,000,000,000~一千零一十亿 -100,100,000,000~一千零一亿 -100,010,000,000~一千亿一千万 -100,001,000,000~一千亿一百万 -100,000,100,000~一千亿零十万 -100,000,010,000~一千亿零一万 -100,000,001,000~一千亿零一千 -100,000,000,100~一千亿零一百 -20万~二十万 -5万~五万 -100万~一百万 -1500万~一千五百万 -20亿~二十亿 -100亿~一百亿 -1500亿~一千五百亿 -9亿~九亿 +101000000000~一千零一十亿 +100100000000~一千零一亿 +我今天买了5个苹果~我今天买了五个苹果 +我今天买了25个苹果~我今天买了二十五个苹果 +我今天买了35个苹果~我今天买了三十五个苹果 +我今天买了50000个苹果~我今天买了五万个苹果 +我今天买了150000个苹果~我今天买了十五万个苹果 +双辽境内除东辽河、西辽河等5条河流~双辽境内除东辽河、西辽河等五条河流 diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_date.txt index c0963e9d0..52ab15f44 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_date.txt @@ -27,4 +27,9 @@ 2020年11月24日~二零二零年十一月二十四日 公元2020年11月24日~公元二零二零年十一月二十四日 1823年3月bc~公元前一八二三年三月 -纪元2013年~纪元二零一三年 \ No newline at end of file +纪元2013年~纪元二零一三年 +今天是2013年3月3日~今天是二零一三年三月三日 +现在是12月25日圣诞节~现在是十二月二十五日圣诞节 +文件上的标注日期是12/2/2~文件上的标注日期是一二年二月二日 +现在是入冬的12月~现在是入冬的十二月 +公元前202年西汉成立~公元前二零二年西汉成立 \ No newline at end of file diff --git 
a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_decimal.txt index 10e49a97f..4d53200fd 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_decimal.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_decimal.txt @@ -5,26 +5,31 @@ -5.5555~负五点五五五五 1890.5555~一千八百九十点五五五五 20.123~二十点一二三 --2930.1929~负二千九百三十点一九二九 +-2930.1929~负两千九百三十点一九二九 0.5~零点五 5.0~五点零 10.567~十点五六七 -123.123~负一百二十三点一二三 3123.1231~三千一百二十三点一二三一 -123123123.12312334234~一亿二千三百一十二万三千一百二十三点一二三一二三三四二三四 +123123123.12312334234~一亿两千三百一十二万三千一百二十三点一二三一二三三四二三四 83888123.2398412~八千三百八十八万八千一百二十三点二三九八四一二 283818.28994万~二十八万三千八百一十八点二八九九四万 -28394919.2312亿~二千八百三十九万四千九百一十九点二三一二亿 +28394919.2312亿~两千八百三十九万四千九百一十九点二三一二亿 23.23万~二十三点二三万 1233.1亿~一千二百三十三点一亿 -123.213万~负一百二十三点二一三万 123.890万~一百二十三点八九零万 -233123.9940亿~负二十三万三千一百二十三点九九四零亿 283943.234123亿~二十八万三千九百四十三点二三四一二三亿 -2391.2318~二千三百九十一点二三一八 +2391.2318~两千三百九十一点二三一八 -1.5亿~负一点五亿 1.5亿~一点五亿 10.67亿~十点六七亿 16.3亿~十六点三亿 12.2亿~十二点二亿 -2342.2342亿~二千三百四十二点二三四二亿 \ No newline at end of file +2342.2342亿~两千三百四十二点二三四二亿 +公司的年收益率是6.5~公司的年收益率是六点五 +现在的室外气温是36.7摄氏度~现在的室外气温是三十六点七摄氏度 +我们可以给你返还1.2个百分点~我们可以给你返还一点二个百分点 +全球现今有71.5亿人~全球现今有七十一点五亿人 +小张的资产值13.5亿~小张的资产值十三点五亿 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_fraction.txt index bae47330d..c39f680f4 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_fraction.txt @@ -4,11 +4,7 @@ -1/3~负三分之一 负1/3~负三分之一 1/2~二分之一 -+1/2~正二分之一 -正1/2~正二分之一 1/10~十分之一 -+100分之1~正一百分之一 -正100分之1~正一百分之一 98%~百分之九十八 -98%~负百分之九十八 负98%~负百分之九十八 @@ -16,19 +12,15 @@ -百分之1~负百分之一 负百分之1~负百分之一 -100分之57~负一百分之五十七 -正100分之57~正一百分之五十七 负100分之57~负一百分之五十七 1/5~五分之一 -1/5~负五分之一 -+1/5~正五分之一 1又1/5~一又五分之一 -+1又1/5~正一又五分之一 
-正1又1/5~正一又五分之一 5又2分之1~五又二分之一 -+5又2分之1~正五又二分之一 -正5又2分之1~正五又二分之一 -1/100~一百分之一 -+1/100~正一百分之一 -正1/100~正一百分之一 +0.4%~百分之零点四 6.3%~百分之六点三 -0.4%~百分之零点四 \ No newline at end of file +公司的年收益率是6.5%~公司的年收益率是百分之六点五 +60%的人口是男性~百分之六十的人口是男性 +全校有80%的学生来自大山深处的贫困地区~全校有百分之八十的学生来自大山深处的贫困地区 +我们的队伍有1/3的人是士官学校出身~我们的队伍有三分之一的人是士官学校出身 +今年的降雨量较往年多了5%~今年的降雨量较往年多了百分之五 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_math.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_math.txt deleted file mode 100644 index d31a3a8d4..000000000 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_math.txt +++ /dev/null @@ -1,2 +0,0 @@ -78:96~七十八比九十六 -±2~正负二 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_measure.txt index 094afc7c4..d220c406a 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_measure.txt @@ -2,4 +2,10 @@ 38°C~三十八摄氏度 120m²~一百二十平方米 10ms~十毫秒 +25千克~二十五千克 -23°C~负二十三摄氏度 +1.2g~一点二克 +测量机显示重量是25kg~测量机显示重量是二十五千克 +现在的室外温度是38°C~现在的室外温度是三十八摄氏度 +这个房子大概有120m²~这个房子大概有一百二十平方米 +整体时长大概是10ms~整体时长大概是十毫秒 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_money.txt index 28075fca1..71dd98d71 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_money.txt @@ -12,7 +12,11 @@ $100~一百美元 5角~五角 5块~五块 6毛~六毛 -5块5毛5分~五块五毛五分 -1.5万美元~一点五万美元 $1.5万~一点五万美元 -3.5万韩元~三点五万韩元 \ No newline at end of file +3.5万韩元~三点五万韩元 +1.5万美元~一点五万美元 +我现在能拿出的现金是100000美元~我现在能拿出的现金是十万美元 +一份煎饼卖5块一份~一份煎饼卖五块一份 +每100美元能兑换700人民币左右~每一百美元能兑换七百人民币左右 +您的银行账户余额为$500~您的银行账户余额为五百美元 
+洛杉矶的最低工资是每小时$15~洛杉矶的最低工资是每小时十五美元 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_ordinal.txt index 57ea76bc1..e84b3dd8d 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_ordinal.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_ordinal.txt @@ -1,149 +1,84 @@ 第10000~第一万 第210~第二百一十 第5000~第五千 -第5100~第五千一百 -第5110~第五千一百一十 -第5111~第五千一百一十一 -第5010~第五千零一十 第5099~第五千零九十九 第5001~第五千零一 -第5,000~第五千 -第5,100~第五千一百 -第5,010~第五千零一十 -第5,001~第五千零一 -第50,000~第五万 -第51,000~第五万一千 -第50,100~第五万零一百 -第50,010~第五万零一十 -第51,100~第五万一千一百 -第51,110~第五万一千一百一十 -第50,011~第五万零一十一 -第50,001~第五万零一 -第50,010~第五万零一十 +第5000~第五千 +第5100~第五千一百 +第50010~第五万零一十 +第51100~第五万一千一百 +第50010~第五万零一十 第50000~第五万 第51000~第五万一千 第50100~第五万零一百 第50010~第五万零一十 -第50001~第五万零一 -第51100~第五万一千一百 -第51110~第五万一千一百一十 -第51111~第五万一千一百一十一 第50111~第五万零一百一十一 第50011~第五万零一十一 -第500,000~第五十万 -第510,000~第五十一万 -第501,000~第五十万一千 -第500,100~第五十万零一百 -第500,010~第五十万零一十 -第500,111~第五十万零一百一十一 -第501,100~第五十万一千一百 -第501,111~第五十万一千一百一十一 -第511,111~第五十一万一千一百一十一 -第100,000~第十万 -第110,000~第十一万 +第500000~第五十万 +第510000~第五十一万 +第501000~第五十万一千 +第100000~第十万 +第110000~第十一万 第500000~第五十万 第510000~第五十一万 第501000~第五十万一千 第500100~第五十万零一百 -第500010~第五十万零一十 -第500001~第五十万零一 -第500111~第五十万零一百一十一 -第501100~第五十万一千一百 第501111~第五十万一千一百一十一 第511111~第五十一万一千一百一十一 第100000~第十万 -第110000~第十一万 -第1100000~第一百一十万 -第1010000~第一百零一万 -第1001000~第一百万一千 第1000100~第一百万零一百 第1000010~第一百万零一十 第1000001~第一百万零一 第1000000~第一百万 -第1,000,000~第一百万 -第1,100,000~第一百一十万 -第1,010,000~第一百零一万 -第1,001,000~第一百万一千 -第1,000,100~第一百万零一百 -第1,000,010~第一百万零一十 -第1,000,001~第一百万零一 +第1000100~第一百万零一百 +第1000010~第一百万零一十 +第1000001~第一百万零一 +第10000000~第一千万 +第11000000~第一千一百万 +第10010001~第一千零一万零一 第10000000~第一千万 第11000000~第一千一百万 -第10100000~第一千零一十万 -第10010000~第一千零一万 -第10001000~第一千万一千 -第10010100~第一千零一万零一百 -第10010010~第一千零一万零一十 第10010001~第一千零一万零一 -第10,000,000~第一千万 -第11,000,000~第一千一百万 
-第10,100,000~第一千零一十万 -第10,010,000~第一千零一万 -第10,001,000~第一千万一千 -第10,010,100~第一千零一万零一百 -第10,010,010~第一千零一万零一十 -第10,010,001~第一千零一万零一 -第101111111~第一亿一百一十一万一千一百一十一 +第101111111~第一亿零一百一十一万一千一百一十一 第110111111~第一亿一千零一十一万一千一百一十一 第111011111~第一亿一千一百零一万一千一百一十一 第111101111~第一亿一千一百一十万一千一百一十一 -第111110111~第一亿一千一百一十一万零一百一十一 -第111111011~第一亿一千一百一十一万一千零一十一 -第111111101~第一亿一千一百一十一万一千一百零一 +第101111111~第一亿零一百一十一万一千一百一十一 +第110111111~第一亿一千零一十一万一千一百一十一 +第111111110~第一亿一千一百一十一万一千一百一十 第111111111~第一亿一千一百一十一万一千一百一十一 -第101,111,111~第一亿一百一十一万一千一百一十一 -第110,111,111~第一亿一千零一十一万一千一百一十一 -第111,011,111~第一亿一千一百零一万一千一百一十一 -第111,101,111~第一亿一千一百一十万一千一百一十一 -第111,110,111~第一亿一千一百一十一万零一百一十一 -第111,111,011~第一亿一千一百一十一万一千零一十一 -第111,111,101~第一亿一千一百一十一万一千一百零一 -第111,111,110~第一亿一千一百一十一万一千一百一十 -第111,111,111~第一亿一千一百一十一万一千一百一十一 第1011111111~第十亿一千一百一十一万一千一百一十一 -第1101111111~第十一亿一百一十一万一千一百一十一 +第1101111111~第十一亿零一百一十一万一千一百一十一 第1110111111~第十一亿一千零一十一万一千一百一十一 第1111011111~第十一亿一千一百零一万一千一百一十一 第1111110111~第十一亿一千一百一十一万零一百一十一 第1111111101~第十一亿一千一百一十一万一千一百零一 第1111111111~第十一亿一千一百一十一万一千一百一十一 -第1,011,111,111~第十亿一千一百一十一万一千一百一十一 -第1,101,111,111~第十一亿一百一十一万一千一百一十一 -第1,110,111,111~第十一亿一千零一十一万一千一百一十一 -第1,111,011,111~第十一亿一千一百零一万一千一百一十一 -第1,111,101,111~第十一亿一千一百一十万一千一百一十一 -第1,111,110,111~第十一亿一千一百一十一万零一百一十一 -第1,111,111,011~第十一亿一千一百一十一万一千零一十一 -第1,111,111,101~第十一亿一千一百一十一万一千一百零一 -第1,111,111,110~第十一亿一千一百一十一万一千一百一十 +第1011111111~第十亿一千一百一十一万一千一百一十一 +第1101111111~第十一亿零一百一十一万一千一百一十一 +第1110111111~第十一亿一千零一十一万一千一百一十一 +第11000000000~第一百一十亿 +第10000100000~第一百亿零十万 第11000000000~第一百一十亿 第10100000000~第一百零一亿 第10010000000~第一百亿一千万 -第10001000000~第一百亿一百万 +第10001000000~第一百亿零一百万 第10000100000~第一百亿零十万 -第11,000,000,000~第一百一十亿 -第10,100,000,000~第一百零一亿 -第10,010,000,000~第一百亿一千万 -第10,001,000,000~第一百亿一百万 -第10,000,100,000~第一百亿零十万 -第10,000,010,000~第一百亿零一万 -第10,000,001,000~第一百亿零一千 -第10,000,000,100~第一百亿零一百 -第10,000,000,010~第一百亿零一十 -第10,000,000,001~第一百亿零一 +第10000000100~第一百亿零一百 +第10000000010~第一百亿零一十 +第10000000001~第一百亿零一 第100000100000~第一千亿零十万 第100000010000~第一千亿零一万 第100000001000~第一千亿零一千 
-第100000000100~第一千亿零一百 第100000000010~第一千亿零一十 第100000000001~第一千亿零一 -第100,000,000,000~第一千亿 -第110,000,000,000~第一千一百亿 -第101,000,000,000~第一千零一十亿 -第100,100,000,000~第一千零一亿 -第100,010,000,000~第一千亿一千万 -第100,001,000,000~第一千亿一百万 -第100,000,100,000~第一千亿零十万 -第100,000,010,000~第一千亿零一万 -第100,000,001,000~第一千亿零一千 -第100,000,000,100~第一千亿零一百 \ No newline at end of file +第100000000000~第一千亿 +第110000000000~第一千一百亿 +第101000000000~第一千零一十亿 +第100100000000~第一千零一亿 +第100010000000~第一千亿一千万 +这个孩子的学习成绩一直是全年级第1~这个孩子的学习成绩一直是全年级第一 +从这一排往下数第5个就是小明~从这一排往下数第五个就是小明 +恭喜您成为本店第100名顾客~恭喜您成为本店第一百名顾客 +这是你人生的第1桶金~这是你人生的第一桶金 +这个名单从头开始到第100都是你的目标客户~这个名单从头开始到第一百都是你的目标客户 diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_preprocess.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_preprocess.txt deleted file mode 100644 index e1b592ebc..000000000 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_preprocess.txt +++ /dev/null @@ -1 +0,0 @@ -你啊好~你好 diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_time.txt index 94b45ac30..9523492a0 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_time.txt @@ -1,4 +1,4 @@ -3:4:5~三点四分五秒 +03:04:05~三点四分五秒 03:04:05~三点四分五秒 00:00:00~零点零分零秒 03:04:05~三点四分五秒 @@ -27,4 +27,8 @@ 5点差3分~五点差三分 5点差5分~五点差五分 5点差4分am~五点差四分am -3个小时15分钟30秒~三个小时十五分钟三十秒 \ No newline at end of file +3个小时15分钟30秒~三个小时十五分钟三十秒 +现在是北京时间下午03:04:05~现在是北京时间下午三点四分五秒 +航班预计会延误5个小时~航班预计会延误五个小时 +大家尽量把手表对准调到5点1刻~大家尽量把手表对准调到五点一刻 +5点1刻离六点就差十五分钟~五点一刻离六点就差十五分钟 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..1700f1af6 --- /dev/null +++ 
b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,10 @@ +这附近有Atm~这附近有ATM +这是一个ufo的照片~这是一个UFO的照片 +nba比赛如期举行~NBA比赛如期举行 +我们需要升级gpu~我们需要升级GPU +他是这个公司的c e o~他是这个公司的CEO +我们已经加入了wto~我们已经加入了WTO +小王以优秀的战绩成为这场游戏的mvp~小王以优秀的战绩成为这场游戏的MVP +这位客人是我们的vip~这位客人是我们的VIP +小王的iq是全班最高的~小王的IQ是全班最高的 +小李读了一个mba~小李读了一个MBA \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt index 4fedd2cd9..23270bf82 100644 --- a/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt +++ b/tests/nemo_text_processing/zh/data_text_normalization/test_cases_word.txt @@ -1,2 +1,7 @@ 你~你 -好~好 \ No newline at end of file +好~好 +你好今天的天气不错~你好今天的天气不错 +只有智商超过一定数值的人才能破解~只有智商超过一定数值的人才能破解 +这是由人工智能控制的系统~这是由人工智能控制的系统 +欧洲旅游目的地多到不知道怎么选~欧洲旅游目的地多到不知道怎么选 +马斯科卖掉豪宅住进折叠屋~马斯科卖掉豪宅住进折叠屋 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/test_math.py b/tests/nemo_text_processing/zh/test_math.py deleted file mode 100644 index cf44a5c22..000000000 --- a/tests/nemo_text_processing/zh/test_math.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -from parameterized import parameterized - -from nemo_text_processing.text_normalization.normalize import Normalizer - -from ..utils import CACHE_DIR, parse_test_case_file - - -class TestMath: - normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') - - @parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_math.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_norm_math(self, test_input, expected): - preds = self.normalizer_zh.normalize(test_input) - assert expected == preds diff --git a/tests/nemo_text_processing/zh/test_preprocess.py b/tests/nemo_text_processing/zh/test_preprocess.py deleted file mode 100644 index 34838cc90..000000000 --- a/tests/nemo_text_processing/zh/test_preprocess.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -from parameterized import parameterized - -from nemo_text_processing.text_normalization.normalize import Normalizer - -from ..utils import CACHE_DIR, parse_test_case_file - - -class TestPreprocess: - normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') - - @parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_preprocess.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_norm_preprocess(self, test_input, expected): - preds = self.normalizer_zh.normalize(test_input) - assert expected == preds diff --git a/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh index 4cbbf0d0d..dd352b42b 100644 --- a/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/zh/test_sparrowhawk_normalization.sh @@ -12,7 +12,7 @@ runtest () { while read testcase; do IFS='~' read written spoken <<< $testcase # replace non breaking space with breaking space - denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration_pp.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') # # trim white space spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" @@ -23,35 +23,42 @@ runtest () { done < "$input" } -testTNMoneyText() { - input=$PROJECT_DIR/zh/data_text_normalization/test_cases_money.txt +testTNTimeText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_time.txt runtest $input } -testTNCharText() { - input=$PROJECT_DIR/zh/data_text_normalization/test_cases_char.txt +testTNCardinalText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_cardinal.txt runtest $input } -testTNTimeText() { - input=$PROJECT_DIR/zh/data_text_normalization/test_cases_time.txt 
+testTNOrdinalText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_ordinal.txt runtest $input } -testTNDateText() { - input=$PROJECT_DIR/zh/data_text_normalization/test_cases_date.txt +testTNDecimalalText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_decimal.txt runtest $input } -# testTNMathText() { -# input=$PROJECT_DIR/zh/data_text_normalization/test_cases_math.txt -# runtest $input -# } testTNFractionText() { input=$PROJECT_DIR/zh/data_text_normalization/test_cases_fraction.txt runtest $input } - -# testTNPreprocessText() { -# input=$PROJECT_DIR/zh/data_text_normalization/test_cases_preprocess.txt -# runtest $input -# } +testTNDateText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_date.txt + runtest $input +} +testTNMoneyText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_money.txt + runtest $input +} +testTNWordText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_word.txt + runtest $input +} +testTNWhitelistText() { + input=$PROJECT_DIR/zh/data_text_normalization/test_cases_whitelist.txt + runtest $input +} testTNMeasureText() { input=$PROJECT_DIR/zh/data_text_normalization/test_cases_measure.txt runtest $input @@ -59,4 +66,5 @@ testTNMeasureText() { # Load shUnit2 -. $PROJECT_DIR/../shunit2/shunit2 +#. $PROJECT_DIR/../shunit2/shunit2 +. 
/workspace/shunit2/shunit2 diff --git a/tests/nemo_text_processing/zh/test_time.py b/tests/nemo_text_processing/zh/test_time.py index ed285983b..590fd591f 100644 --- a/tests/nemo_text_processing/zh/test_time.py +++ b/tests/nemo_text_processing/zh/test_time.py @@ -31,11 +31,11 @@ def test_norm_time(self, test_input, expected): preds = self.normalizer_zh.normalize(test_input) assert expected == preds - # inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) - - # @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_time.txt')) - # @pytest.mark.run_only_on('CPU') - # @pytest.mark.unit - # def test_denorm(self, test_input, expected): - # pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - # assert pred == expected + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_whitelist.py b/tests/nemo_text_processing/zh/test_whitelist.py index 8e6087f53..deb857e7a 100644 --- a/tests/nemo_text_processing/zh/test_whitelist.py +++ b/tests/nemo_text_processing/zh/test_whitelist.py @@ -1,32 +1,41 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import pytest -from parameterized import parameterized - -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer - -from ..utils import CACHE_DIR, parse_test_case_file - - -class TestWhitelist: - inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) - - @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_whitelist.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDate: + normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + + @parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm_date(self, test_input, expected): + preds = self.normalizer_zh.normalize(test_input) + assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_word.py b/tests/nemo_text_processing/zh/test_word.py index ddf587857..3314ea90b 100644 --- a/tests/nemo_text_processing/zh/test_word.py +++ b/tests/nemo_text_processing/zh/test_word.py @@ -17,6 +17,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -30,3 +31,12 @@ class TestWord: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + + 
@parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm_date(self, test_input, expected): + preds = self.normalizer_zh.normalize(test_input) + assert expected == preds diff --git a/tools/text_processing_deployment/Dockerfile b/tools/text_processing_deployment/Dockerfile index 0fa7d855b..a3badfde5 100644 --- a/tools/text_processing_deployment/Dockerfile +++ b/tools/text_processing_deployment/Dockerfile @@ -32,7 +32,6 @@ RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/pr RUN tar xzvf protobuf-2.5.0.tar.gz RUN cd protobuf-2.5.0 && ./configure && make && make install && ldconfig RUN conda install -c conda-forge thrax=1.3.4 -y -RUN git clone https://github.com/yzhang123/sparrowhawk.git -RUN cd sparrowhawk && git checkout test && apt-get install -y autoconf && bash autoreconf && ./configure && make && make install && ldconfig +RUN git clone https://github.com/anand-nv/sparrowhawk.git && cd sparrowhawk && git checkout nemo_tests && apt-get install -y autoconf && bash autoreconf && ./configure && make && make install && ldconfig RUN git clone https://github.com/kward/shunit2.git RUN echo "DONE" diff --git a/tools/text_processing_deployment/docker/launch.sh b/tools/text_processing_deployment/docker/launch.sh index 98fdff534..1bb4c78ca 100644 --- a/tools/text_processing_deployment/docker/launch.sh +++ b/tools/text_processing_deployment/docker/launch.sh @@ -50,11 +50,11 @@ elif [[ $MODE == "test_itn_grammars" ]]; then fi echo $MOUNTS -docker run -it --rm \ +docker run -it -e LANG=C.UTF-8 -e LC_ALL=C.UTF-8 --rm \ --shm-size=4g \ --ulimit memlock=-1 \ --ulimit stack=67108864 \ $MOUNTS \ -v $SCRIPT_DIR/../../../tests/nemo_text_processing/:/workspace/tests/ \ -w $WORK_DIR \ - sparrowhawk:latest $CMD \ No newline at end of file + sparrowhawk:latest $CMD diff --git a/tools/text_processing_deployment/pynini_export.py 
b/tools/text_processing_deployment/pynini_export.py index c40d5d6d0..427bbaf6e 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -52,6 +52,8 @@ def tn_grammars(**kwargs): ).fst } d['verbalize'] = {'ALL': TNVerbalizeFst(deterministic=True).fst, 'REDUP': pynini.accep("REDUP")} + if TNPostProcessingFst is not None: + d['post_process'] = {'POSTPROCESSOR': TNPostProcessingFst().fst} return d @@ -66,6 +68,8 @@ def export_grammars(output_dir, grammars): for category, graphs in grammars.items(): out_dir = os.path.join(output_dir, category) + if category == "post_process": + out_dir = os.path.join(output_dir, "verbalize") if not os.path.exists(out_dir): os.makedirs(out_dir) time.sleep(1) @@ -113,7 +117,7 @@ def parse_args(): if args.language in ['pt', 'ru', 'vi', 'es_en', 'mr'] and args.grammars == 'tn_grammars': raise ValueError('Only ITN grammars could be deployed in Sparrowhawk for the selected languages.') - + TNPostProcessingFst = None if args.language == 'en': from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import ( ClassifyFst as ITNClassifyFst, @@ -124,7 +128,11 @@ def parse_args(): from nemo_text_processing.text_normalization.en.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, ) + from nemo_text_processing.text_normalization.en.verbalizers.post_processing import ( + PostProcessingFst as TNPostProcessingFst, + ) from nemo_text_processing.text_normalization.en.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + elif args.language == 'de': from nemo_text_processing.inverse_text_normalization.de.taggers.tokenize_and_classify import ( ClassifyFst as ITNClassifyFst, @@ -205,6 +213,9 @@ def parse_args(): from nemo_text_processing.text_normalization.zh.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, ) + from nemo_text_processing.text_normalization.zh.verbalizers.post_processing import ( + PostProcessingFst as 
TNPostProcessingFst, + ) from nemo_text_processing.text_normalization.zh.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst elif args.language == 'ar': from nemo_text_processing.inverse_text_normalization.ar.taggers.tokenize_and_classify import ( @@ -242,10 +253,6 @@ def parse_args(): from nemo_text_processing.inverse_text_normalization.hy.verbalizers.verbalize import ( VerbalizeFst as ITNVerbalizeFst, ) - from nemo_text_processing.text_normalization.hy.taggers.tokenize_and_classify import ( - ClassifyFst as TNClassifyFst, - ) - from nemo_text_processing.text_normalization.hy.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( output_dir=output_dir,