From f3ac5ca45e0307c8d636a07761ddd39ddb4f1f1b Mon Sep 17 00:00:00 2001 From: Alex Cui Date: Wed, 8 Feb 2023 20:00:09 +0530 Subject: [PATCH 01/89] Add ZH ITN Signed-off-by: Anand Joseph --- .../inverse_text_normalization/zh/__init__.py | 17 + .../zh/clean_eval_data.py | 342 ++++++++++++++++ .../zh/data/__init__.py | 13 + .../zh/data/date/__init__.py | 13 + .../zh/data/date/day-nano.tsv | 74 ++++ .../zh/data/date/month-nano.tsv | 49 +++ .../zh/data/money/__init__.py | 13 + .../zh/data/money/currency_major-nano.tsv | 77 ++++ .../zh/data/money/currency_minor-nano.tsv | 10 + .../money/currency_rmb_minor_cent-nano.tsv | 1 + .../money/currency_rmb_minor_tencent-nano.tsv | 2 + .../zh/data/numbers/__init__.py | 13 + .../zh/data/numbers/digit-nano.tsv | 24 ++ .../zh/data/numbers/tens-nano.tsv | 23 ++ .../zh/data/numbers/tens_re-nano.tsv | 20 + .../zh/data/numbers/zero-nano.tsv | 1 + .../zh/data/time/__init__.py | 13 + .../zh/data/time/hours_to-nano.tsv | 25 ++ .../zh/data/time/minutes_to-nano.tsv | 59 +++ .../zh/data/time/time_hours-nano.tsv | 55 +++ .../zh/data/time/time_minutes-nano.tsv | 163 ++++++++ .../zh/graph_utils.py | 196 +++++++++ .../zh/taggers/__init__.py | 13 + .../zh/taggers/cardinal.py | 371 ++++++++++++++++++ .../zh/taggers/date.py | 90 +++++ .../zh/taggers/decimal.py | 111 ++++++ .../zh/taggers/fraction.py | 47 +++ .../zh/taggers/money.py | 130 ++++++ .../zh/taggers/ordinal.py | 33 ++ .../zh/taggers/punctuation.py | 35 ++ .../zh/taggers/time.py | 154 ++++++++ .../zh/taggers/tokenize_and_classify.py | 120 ++++++ .../zh/taggers/word.py | 30 ++ .../inverse_text_normalization/zh/utils.py | 47 +++ .../zh/verbalizers/__init__.py | 13 + .../zh/verbalizers/cardinal.py | 95 +++++ .../zh/verbalizers/date.py | 87 ++++ .../zh/verbalizers/decimal.py | 92 +++++ .../zh/verbalizers/fraction.py | 62 +++ .../zh/verbalizers/money.py | 70 ++++ .../zh/verbalizers/ordinal.py | 40 ++ .../zh/verbalizers/time.py | 84 ++++ .../zh/verbalizers/verbalize.py | 70 ++++ 
.../zh/verbalizers/verbalize_final.py | 44 +++ .../zh/verbalizers/word.py | 38 ++ .../test_cases_cardinal.txt | 130 ++++++ .../test_cases_date.txt | 31 ++ .../test_cases_decimal.txt | 42 ++ .../test_cases_fraction.txt | 20 + .../test_cases_money.txt | 49 +++ .../test_cases_ordinal.txt | 57 +++ .../test_cases_time.txt | 22 ++ .../test_cases_word.txt | 49 +++ .../nemo_text_processing/zh/test_cardinal.py | 31 ++ tests/nemo_text_processing/zh/test_date.py | 11 + tests/nemo_text_processing/zh/test_decimal.py | 30 ++ .../nemo_text_processing/zh/test_fraction.py | 9 + tests/nemo_text_processing/zh/test_money.py | 11 + tests/nemo_text_processing/zh/test_ordinal.py | 30 ++ ..._sparrowhawk_inverse_text_normalization.sh | 84 ++++ tests/nemo_text_processing/zh/test_time.py | 10 + .../nemo_text_processing/zh/test_whitelist.py | 31 ++ tests/nemo_text_processing/zh/test_word.py | 31 ++ 63 files changed, 3757 insertions(+) create mode 100644 nemo_text_processing/inverse_text_normalization/zh/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/clean_eval_data.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/date/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/date/day-nano.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/date/month-nano.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/money/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major-nano.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/money/currency_minor-nano.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_cent-nano.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_tencent-nano.tsv create 
mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/numbers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/numbers/digit-nano.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens-nano.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens_re-nano.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/numbers/zero-nano.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/time/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/time/hours_to-nano.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/time/minutes_to-nano.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/time/time_hours-nano.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/time/time_minutes-nano.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/zh/graph_utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/taggers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/taggers/date.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/taggers/money.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/taggers/time.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py create mode 100644 
nemo_text_processing/inverse_text_normalization/zh/taggers/word.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/verbalizers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py create mode 100644 nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py create mode 100644 tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_date.txt create mode 100644 tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt create mode 100644 tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_fraction.txt create mode 100644 tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_money.txt create mode 100644 tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_ordinal.txt create mode 100644 tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt create mode 100644 tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt 
create mode 100644 tests/nemo_text_processing/zh/test_cardinal.py create mode 100644 tests/nemo_text_processing/zh/test_decimal.py create mode 100644 tests/nemo_text_processing/zh/test_ordinal.py create mode 100644 tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh create mode 100644 tests/nemo_text_processing/zh/test_whitelist.py create mode 100644 tests/nemo_text_processing/zh/test_word.py diff --git a/nemo_text_processing/inverse_text_normalization/zh/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/__init__.py new file mode 100644 index 000000000..c07b8e4c2 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.zh.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/zh/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/zh/clean_eval_data.py new file mode 100644 index 000000000..d9bc2fccb --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/clean_eval_data.py @@ -0,0 +1,342 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from argparse import ArgumentParser +from typing import List + +import regex as re +from nemo_text_processing.text_normalization.data_loader_utils import ( + EOS_TYPE, + Instance, + load_files, + training_data_to_sentences, +) + + +""" +This file is for evaluation purposes. +filter_loaded_data() cleans data (list of instances) for inverse text normalization. Filters and cleaners can be specified for each semiotic class individually. +For example, normalized text should only include characters and whitespace characters but no punctuation. + Cardinal unnormalized instances should contain at least one integer and all other characters are removed. 
+""" + + +class Filter: + """ + Filter class + + Args: + class_type: semiotic class used in dataset + process_func: function to transform text + filter_func: function to filter text + + """ + + def __init__(self, class_type: str, process_func: object, filter_func: object): + self.class_type = class_type + self.process_func = process_func + self.filter_func = filter_func + + def filter(self, instance: Instance) -> bool: + """ + filter function + + Args: + filters given instance with filter function + + Returns: True if given instance fulfills criteria or does not belong to class type + """ + if instance.token_type != self.class_type: + return True + return self.filter_func(instance) + + def process(self, instance: Instance) -> Instance: + """ + process function + + Args: + processes given instance with process function + + Returns: processed instance if instance belongs to expected class type or original instance + """ + if instance.token_type != self.class_type: + return instance + return self.process_func(instance) + + +def filter_cardinal_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_cardinal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r"[^0-9]", "", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_ordinal_1(instance: Instance) -> bool: + ok = re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized) + return ok + + +def process_ordinal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r"[,\s]", "", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def 
filter_decimal_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_decimal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + un_normalized = re.sub(r",", "", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_measure_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_measure_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r",", "", un_normalized) + un_normalized = re.sub(r"m2", "m²", un_normalized) + un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized) + normalized = re.sub(r"[^a-z\s]", "", normalized) + normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_money_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_money_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r",", "", un_normalized) + un_normalized = re.sub(r"a\$", r"$", un_normalized) + un_normalized = re.sub(r"us\$", r"$", un_normalized) + un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized) + un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_time_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_time_1(instance: Instance) -> Instance: + un_normalized = 
instance.un_normalized + un_normalized = re.sub(r": ", ":", un_normalized) + un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized) + un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_plain_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_plain_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_punct_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_punct_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_date_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_date_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + un_normalized = re.sub(r",", "", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_letters_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_letters_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_verbatim_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_verbatim_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + 
normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_digit_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_digit_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_telephone_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_telephone_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_electronic_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_electronic_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_fraction_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_fraction_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_address_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_address_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = 
re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +filters = [] +filters.append(Filter(class_type="CARDINAL", process_func=process_cardinal_1, filter_func=filter_cardinal_1)) +filters.append(Filter(class_type="ORDINAL", process_func=process_ordinal_1, filter_func=filter_ordinal_1)) +filters.append(Filter(class_type="DECIMAL", process_func=process_decimal_1, filter_func=filter_decimal_1)) +filters.append(Filter(class_type="MEASURE", process_func=process_measure_1, filter_func=filter_measure_1)) +filters.append(Filter(class_type="MONEY", process_func=process_money_1, filter_func=filter_money_1)) +filters.append(Filter(class_type="TIME", process_func=process_time_1, filter_func=filter_time_1)) + +filters.append(Filter(class_type="DATE", process_func=process_date_1, filter_func=filter_date_1)) +filters.append(Filter(class_type="PLAIN", process_func=process_plain_1, filter_func=filter_plain_1)) +filters.append(Filter(class_type="PUNCT", process_func=process_punct_1, filter_func=filter_punct_1)) +filters.append(Filter(class_type="LETTERS", process_func=process_letters_1, filter_func=filter_letters_1)) +filters.append(Filter(class_type="VERBATIM", process_func=process_verbatim_1, filter_func=filter_verbatim_1)) +filters.append(Filter(class_type="DIGIT", process_func=process_digit_1, filter_func=filter_digit_1)) +filters.append(Filter(class_type="TELEPHONE", process_func=process_telephone_1, filter_func=filter_telephone_1)) +filters.append(Filter(class_type="ELECTRONIC", process_func=process_electronic_1, filter_func=filter_electronic_1)) +filters.append(Filter(class_type="FRACTION", process_func=process_fraction_1, filter_func=filter_fraction_1)) +filters.append(Filter(class_type="ADDRESS", process_func=process_address_1, filter_func=filter_address_1)) +filters.append(Filter(class_type=EOS_TYPE, process_func=lambda x: x, filter_func=lambda x: True)) + + +def filter_loaded_data(data: 
List[Instance], verbose: bool = False) -> List[Instance]: + """ + Filters list of instances + + Args: + data: list of instances + + Returns: filtered and transformed list of instances + """ + updates_instances = [] + for instance in data: + updated_instance = False + for fil in filters: + if fil.class_type == instance.token_type and fil.filter(instance): + instance = fil.process(instance) + updated_instance = True + if updated_instance: + if verbose: + print(instance) + updates_instances.append(instance) + return updates_instances + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument("--input", help="input file path", type=str, default='./en_with_types/output-00001-of-00100') + parser.add_argument("--verbose", help="print filtered instances", action='store_true') + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + file_path = args.input + + print("Loading training data: " + file_path) + instance_list = load_files([file_path]) # List of instances + filtered_instance_list = filter_loaded_data(instance_list, args.verbose) + training_data_to_sentences(filtered_instance_list) diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/__init__.py new file mode 100644 index 000000000..4fc50543f --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/date/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/date/__init__.py new file mode 100644 index 000000000..4fc50543f --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/date/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/date/day-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/date/day-nano.tsv new file mode 100644 index 000000000..fd3e3ddab --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/date/day-nano.tsv @@ -0,0 +1,74 @@ +一 1 +二 2 +三 3 +四 4 +五 5 +六 6 +七 7 +八 8 +九 9 +十 10 +十一 11 +十二 12 +十三 13 +十四 14 +十五 15 +十六 16 +十七 17 +十八 18 +十九 19 +二十 20 +二十一 21 +二十二 22 +二十三 23 +二十四 24 +二十五 25 +二十六 26 +二十七 27 +二十八 28 +二十九 29 +三十 30 +三十一 31 +壹 1 +貳 2 +參 3 +肆 4 +伍 5 +陸 6 +柒 7 +捌 8 +玖 9 +幺 1 +两 2 +兩 2 +拾 10 +拾壹 11 +拾貳 12 +拾叁 13 +拾肆 14 +拾伍 15 +拾陸 16 +拾柒 17 +拾捌 18 +拾玖 19 +貳拾 20 +貳拾壹 21 +貳拾貳 22 +貳拾叁 23 +貳拾肆 24 +貳拾伍 25 +貳拾陸 26 +貳拾柒 27 +貳拾捌 28 +貳拾玖 29 +叁拾 30 +叁拾壹 31 +壹 1 +拾壹 11 +贰拾壹 21 +贰 2 +陆 6 +拾贰 12 +拾陆 16 +贰拾贰 22 +贰拾陆 26 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/date/month-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/date/month-nano.tsv new file mode 100644 index 000000000..5b2f33539 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/date/month-nano.tsv @@ -0,0 +1,49 @@ +一 1 +二 2 +三 3 +四 4 +五 5 +六 6 +七 7 +八 8 +九 9 +十 10 +十一 11 +十二 12 +一十 10 +零一 1 +零二 2 +零三 3 +零四 4 +零五 5 +零六 6 +零七 7 +零八 8 +零九 9 +壹 1 +贰 2 +叁 3 +肆 4 +伍 5 +陆 6 +柒 7 +捌 8 +玖 9 +拾 10 +拾壹 11 +拾贰 12 +壹拾 10 +零壹 1 +零贰 2 +零叁 3 +零肆 4 +零伍 5 +零陆 6 +零柒 7 +零捌 8 +零玖 9 +貳 2 +零貳 2 +陸 6 +零陸 6 +拾貳 12 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/money/__init__.py new file mode 100644 index 000000000..4fc50543f --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/money/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major-nano.tsv new file mode 100644 index 000000000..22d7a0579 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major-nano.tsv @@ -0,0 +1,77 @@ +美元 US$ +欧元 € +歐元 € +英镑 £ +英鎊 £ +加拿大元 CAD$ +加拿大币 CAD$ +加拿大幣 CAD$ +加元 CAD$ +加币 CAD$ +加幣 CAD$ +瑞士法郎 Fr +法郎 ₣ +圆 ¥ +圓 ¥ +瑞典克朗 Kr +墨西哥比索 NXN$ +新西兰元 NZD$ +新西蘭元 NZD$ +新加坡币 SGD$ +新加坡幣 SGD$ +新加坡元 SGD$ +港元 HKD$ +港币 HKD$ +港幣 HKD$ +挪威克朗 NOKkr +韩元 ₩ +韓元 ₩ +韩币 ₩ +韓幣 ₩ +土耳其里拉 TRY₺ +印度卢布 ₹ +印度盧布 ₹ +印度卢比 ₹ +印度盧比 ₹ +俄罗斯卢布 ₽ +俄羅斯盧布 ₽ +俄罗斯卢比 ₽ +俄羅斯盧比 ₽ +巴西雷亚尔 BRLR$ +巴西雷亞爾 BRLR$ +南非兰特 R +南非蘭特 R +丹麦克朗 DKKkr +丹麥克朗 DKKkr +波兰兹罗提 zł +波蘭兹儸提 zł +新台币 TWDNT$ +新臺幣 TWDNT$ +泰铢 ฿ +泰銖 ฿ +马来西亚林吉特 RM +馬來西亞林吉特 RM +印尼盾 Rp +匈牙利福林 Ft +捷克克朗 Kč +以色列新谢克尔 ₪ +以色列新謝克爾 ₪ +智利披索 CLP$ +菲律宾披索 ₱ +菲律賓披索 ₱ +阿联酋迪拉姆 د.إ +阿聯酋迪拉姆 د.إ +哥伦比亚披索 COL$ +哥倫比亞披索 COL$ +马来西亚令吉 RM +馬來西亞令吉 RM +罗马尼亚列伊 L +羅馬尼亞列伊 L +日元 JPY¥ +日圆 JPY¥ +日圓 JPY¥ +元 ¥ +圓 ¥ +圆 ¥ +人民币 ¥ +人民幣 ¥ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_minor-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_minor-nano.tsv new file mode 100644 index 000000000..f39777e21 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_minor-nano.tsv @@ -0,0 
+1,10 @@ +美分 US$ +欧分 € +便士 £ +加拿大分 CAD$ +生丁 ₣ +瑞典欧尔 KrOre +分 NXN$ +新西兰仙 NZD$ +挪威欧尔 NOKOre +分 ¥ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_cent-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_cent-nano.tsv new file mode 100644 index 000000000..dd65818fd --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_cent-nano.tsv @@ -0,0 +1 @@ +分 ¥ diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_tencent-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_tencent-nano.tsv new file mode 100644 index 000000000..2f0e91476 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_tencent-nano.tsv @@ -0,0 +1,2 @@ +毛 ¥ +角 ¥ diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/__init__.py new file mode 100644 index 000000000..bc443be41 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/digit-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/digit-nano.tsv new file mode 100644 index 000000000..04a36a2ce --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/digit-nano.tsv @@ -0,0 +1,24 @@ +一 1 +二 2 +三 3 +四 4 +五 5 +六 6 +七 7 +八 8 +九 9 +壹 1 +貳 2 +贰 2 +參 3 +叁 3 +肆 4 +伍 5 +陸 6 +陆 6 +柒 7 +捌 8 +玖 9 +幺 1 +两 2 +兩 2 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens-nano.tsv new file mode 100644 index 000000000..a390e08d9 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens-nano.tsv @@ -0,0 +1,23 @@ +十 1 +一十 1 +二十 2 +三十 3 +四十 4 +五十 5 +六十 6 +七十 7 +八十 8 +九十 9 +拾 1 +壹拾 1 +貳拾 2 +贰拾 2 +叁拾 3 +參拾 3 +肆拾 4 +伍拾 5 +陸拾 6 +陆拾 6 +柒拾 7 +捌拾 8 +玖拾 9 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens_re-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens_re-nano.tsv new file mode 100644 index 000000000..58dbe8879 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens_re-nano.tsv @@ -0,0 +1,20 @@ +一十 1 +二十 2 +三十 3 +四十 4 +五十 5 +六十 6 +七十 7 +八十 8 +九十 9 +壹拾 1 +贰拾 2 +叁拾 3 +肆拾 4 +伍拾 5 +陆拾 6 +柒拾 7 +捌拾 8 +玖拾 9 +貳拾 2 +陸拾 6 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/zero-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/zero-nano.tsv new file mode 100644 index 000000000..4b4120706 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/zero-nano.tsv @@ -0,0 +1 @@ +零 0 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/time/__init__.py new file mode 100644 index 000000000..4fc50543f --- /dev/null +++ 
b/nemo_text_processing/inverse_text_normalization/zh/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/hours_to-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/hours_to-nano.tsv new file mode 100644 index 000000000..a56219579 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/hours_to-nano.tsv @@ -0,0 +1,25 @@ +1 0 +2 1 +3 2 +4 3 +5 4 +6 5 +7 6 +8 7 +9 8 +10 9 +11 10 +12 11 +13 12 +14 13 +15 14 +16 15 +17 16 +18 17 +19 18 +20 19 +21 20 +22 21 +23 22 +24 23 +0 23 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/minutes_to-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/minutes_to-nano.tsv new file mode 100644 index 000000000..11ac8f2a9 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/minutes_to-nano.tsv @@ -0,0 +1,59 @@ +59 01 +58 02 +57 03 +56 04 +55 05 +54 06 +53 07 +52 08 +51 09 +50 10 +49 11 +48 12 +47 13 +46 14 +45 15 +44 16 +43 17 +42 18 +41 19 +40 20 +39 21 +38 22 +37 23 +36 24 +35 25 +34 26 +33 27 +32 28 +31 29 +30 30 +29 31 +28 32 +27 33 +26 34 +25 35 +24 36 +23 37 +22 38 +21 39 +20 40 +19 41 +18 42 +17 43 +16 44 +15 45 +14 46 +13 47 +12 48 +11 49 +10 50 +09 51 +08 52 +07 53 +06 54 +05 55 +04 56 +03 57 +02 58 +01 59 \ No newline 
at end of file diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_hours-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_hours-nano.tsv new file mode 100644 index 000000000..4a00dc817 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_hours-nano.tsv @@ -0,0 +1,55 @@ +一 1 +二 2 +三 3 +四 4 +五 5 +六 6 +七 7 +八 8 +九 9 +十 10 +十一 11 +十二 12 +十三 13 +十四 14 +十五 15 +十六 16 +十七 17 +十八 18 +十九 19 +二十 20 +二十一 21 +二十二 22 +二十三 23 +二十四 24 +壹 1 +貳 2 +參 3 +肆 4 +伍 5 +陸 6 +柒 7 +捌 8 +玖 9 +拾 10 +拾壹 11 +拾貳 12 +拾叁 13 +拾肆 14 +拾伍 15 +拾陸 16 +拾柒 17 +拾捌 18 +拾玖 19 +二十 20 +二十一 21 +二十二 22 +二十三 23 +二十四 24 +兩 2 +两 2 +贰 2 +陆 6 +拾贰 12 +拾陆 16 +贰拾贰 22 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_minutes-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_minutes-nano.tsv new file mode 100644 index 000000000..808d9394d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_minutes-nano.tsv @@ -0,0 +1,163 @@ +一 01 +二 02 +三 03 +四 04 +五 05 +六 06 +七 07 +八 08 +九 09 +十 10 +十一 11 +十二 12 +十三 13 +十四 14 +十五 15 +十六 16 +十七 17 +十八 18 +十九 19 +二十 20 +二十一 21 +二十二 22 +二十三 23 +二十四 24 +二十五 25 +二十六 26 +二十七 27 +二十八 28 +二十九 29 +三十 30 +三十一 31 +三十二 32 +三十三 33 +三十四 34 +三十五 35 +三十六 36 +三十七 37 +三十八 38 +三十九 39 +四十 40 +四十一 41 +欸十二 42 +四十三 43 +四十四 44 +四十五 45 +四十六 46 +四十七 47 +四十八 48 +四十九 49 +五十 50 +五十一 51 +五二十 52 +五十三 53 +五十四 54 +五十五 55 +五十六 56 +五十七 57 +五十八 58 +五十九 59 +六十 60 +一十 10 +零一 01 +零二 02 +零三 03 +零四 04 +零五 05 +零六 06 +零七 07 +零八 08 +零九 09 +壹 01 +贰 02 +叁 03 +肆 04 +伍 05 +陆 06 +柒 07 +捌 08 +玖 09 +拾 10 +拾壹 11 +拾贰 12 +拾叁 13 +拾肆 14 +拾伍 15 +拾陆 16 +拾柒 17 +拾捌 18 +拾玖 19 +贰拾 20 +贰拾壹 21 +贰拾贰 22 +贰拾叁 23 +贰拾肆 24 +贰拾伍 25 +贰拾陆 26 +贰拾柒 27 +贰拾捌 28 +贰拾玖 29 +叁拾 30 +叁拾壹 31 +叁拾贰 32 +叁拾叁 33 +叁拾肆 34 +叁拾伍 35 +叁拾陆 36 +叁拾柒 37 +叁拾捌 38 +叁拾玖 39 +肆拾 40 +肆拾壹 41 +肆拾贰 42 +肆拾叁 43 +肆拾肆 44 +肆拾伍 45 +肆拾陆 46 +肆拾柒 47 +肆拾捌 48 +肆拾玖 49 +伍拾 50 +伍拾壹 51 +伍拾贰 52 +伍拾叁 53 +伍拾肆 54 
+伍拾伍 55 +伍拾陆 56 +伍拾柒 57 +伍拾捌 58 +伍拾玖 59 +陆拾 60 +壹拾 10 +零壹 01 +零贰 02 +零叁 03 +零肆 04 +零伍 05 +零陆 06 +零柒 07 +零捌 08 +零玖 09 +貳 02 +零貳 02 +陸 06 +零陸 06 +拾貳 12 +貳拾貳 22 +貳拾 20 +貳拾壹 21 +貳拾叁 23 +貳拾肆 24 +貳拾伍 25 +貳拾陸 26 +貳拾柒 27 +貳拾捌 28 +貳拾玖 29 +叁拾貳 32 +肆拾貳 42 +吳氏貳 52 +拾陸 16 +叁拾陸 36 +肆拾陸 46 +伍拾陸 56 +陸拾 60 diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py new file mode 100644 index 000000000..dfef9dc9b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import os
import string
from pathlib import Path
from typing import Dict

import pynini

# NOTE(review): this zh graph_utils imports get_abs_path from the *English*
# text_normalization package; zh has its own utils.get_abs_path — confirm this
# is intentional (it appears unused in this module).
from nemo_text_processing.text_normalization.en.utils import get_abs_path
from pynini import Far
from pynini.examples import plurals
from pynini.export import export
from pynini.lib import byte, pynutil, utf8

# Shared FST building blocks used by all zh taggers/verbalizers.
NEMO_CHAR = utf8.VALID_UTF8_CHAR  # any single valid UTF-8 character

NEMO_DIGIT = byte.DIGIT  # ASCII digit 0-9
NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
NEMO_NON_BREAKING_SPACE = u"\u00A0"
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()

NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()

# Sigma-star over all UTF-8 characters (accepts any string).
NEMO_SIGMA = pynini.closure(NEMO_CHAR)

# Common whitespace-handling transducers.
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1))
insert_space = pynutil.insert(" ")
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
# Strips serialized token-ordering metadata emitted by the tagger.
delete_preserve_order = pynini.closure(
    pynutil.delete(" preserve_order: true")
    | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\""))
)

# English pluralization machinery inherited from the en grammar; kept disabled
# because it does not apply to Chinese.
# suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
# _c = pynini.union(
#     "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"
# )
# _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
# _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
# _s = NEMO_SIGMA + pynutil.insert("s")
def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
    """
    Exports graphs as an OpenFst finite state archive (FAR) file.

    Args:
        file_name: exported file name
        graphs: mapping of rule name -> pynini WFST graph to be exported
    """
    exporter = export.Exporter(file_name)
    for rule, graph in graphs.items():
        exporter[rule] = graph.optimize()
    exporter.close()
    print(f'Created {file_name}')


def convert_space(fst) -> 'pynini.FstLike':
    """
    Converts space to non-breaking space.
    Used only in tagger grammars for transducing token values within quotes,
    e.g. name: "hello kitty". This makes the transducer significantly slower,
    so only use it when there could be spaces within quotes.

    Args:
        fst: input fst

    Returns:
        fst where breaking spaces are rewritten to non-breaking spaces
    """
    return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA)


class GraphFst:
    """
    Base class for all grammar fsts.

    Args:
        name: name of grammar class
        kind: either 'classify' or 'verbalize'
        deterministic: if True will provide a single transduction option,
            for False multiple transductions are generated (used for
            audio-based normalization)
    """

    def __init__(self, name: str, kind: str, deterministic: bool = True):
        self.name = name
        # BUG FIX: was `self.kind = str`, which stored the builtin `str` type
        # object instead of the 'classify'/'verbalize' string passed in.
        self.kind = kind
        self._fst = None
        self.deterministic = deterministic

        # Pre-built FARs (if present) are cached under grammars/<kind>/<name>.far
        self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far')
        if self.far_exist():
            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()

    def far_exist(self) -> bool:
        """
        Returns true if FAR can be loaded.
        """
        return self.far_path.exists()

    @property
    def fst(self) -> 'pynini.FstLike':
        return self._fst

    @fst.setter
    def fst(self, fst):
        self._fst = fst

    def add_tokens(self, fst) -> 'pynini.FstLike':
        """
        Wraps class name around the given fst, e.g. `cardinal { ... }`.

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

    def delete_tokens(self, fst) -> 'pynini.FstLike':
        """
        Deletes the class-name wrapper around the output of the given fst.

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        res = (
            pynutil.delete(f"{self.name}")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + fst
            + delete_space
            + pynutil.delete("}")
        )
        # Restore regular spaces that convert_space() turned into NBSP.
        return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py new file mode 100644 index 000000000..d21faa491 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -0,0 +1,371 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import pynini

from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst
from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path
from pynini.lib import pynutil


class CardinalFst(GraphFst):
    def __init__(self):
        """
        Finite state transducer for classifying cardinals
        (e.g., 负五十 -> cardinal { negative: "-" integer: "50" })
        This class converts cardinals up to the hundred-billions place (千亿, ~10**11).
        Single unit digits are not converted (e.g., 五 -> 五)
        Numbers less than 20 are not converted.
        二十 (2 characters/logograms) is kept as it is but 二十一 (3 characters/logograms) would become 21
        """
        super().__init__(name="cardinal", kind="classify")

        # Place-value markers: "closure_*" accepts the marker (kept for the
        # "simple" quantity forms), "delete_*" removes it (used when the digits
        # after the marker are spelled out).  Each union covers both the
        # simplified and the traditional/financial character variants.
        delete_hundreds = pynutil.delete("百") | pynutil.delete("佰")
        closure_thousands = pynini.accep("千") | pynini.accep("仟")
        delete_thousands = pynutil.delete("千") | pynutil.delete("仟")
        closure_ten_thousands = pynini.accep("萬") | pynini.accep("万")
        delete_ten_thousands = pynutil.delete("萬") | pynutil.delete("万")
        closure_hundred_millions = pynini.accep("亿") | pynini.accep("億")
        delete_hundred_millions = pynutil.delete("亿") | pynutil.delete("億")

        # Data imported: single-character -> Arabic-digit maps.
        zero = pynini.string_file(get_abs_path("data/numbers/zero-nano.tsv"))
        digits = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv"))
        tens = pynini.string_file(get_abs_path("data/numbers/tens-nano.tsv"))

        # Grammar for digits; the insert("0") branch pads a missing digit slot.
        graph_digits = digits | pynutil.insert("0")

        # Grammar for teens (十/拾 followed by an optional digit).
        ten = pynini.string_map([("十", "1"), ("拾", "1")])
        graph_teens = ten + graph_digits
        graph_teens = graph_teens | pynutil.insert("0")

        # Grammar for tens; not the output for the cardinal grammar itself but
        # for pure Arabic digits (reused by other grammars).
        graph_tens = (tens + graph_digits) | (pynini.cross(pynini.accep("零"), "0") + graph_digits)
        graph_all = graph_tens | pynutil.insert("00")

        # Grammar for tens 20-90 which only converts the forms written with
        # 3 Mandarin characters (see class docstring: 二十 stays, 二十一 -> 21).
        tens_re = pynini.string_file(get_abs_path("data/numbers/tens_re-nano.tsv"))
        digits_re = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv"))
        graph_all_re = tens_re + digits_re

        # Grammar for hundreds 百.  The second branch handles an explicit
        # internal zero (e.g. 一百零五 -> 105); closure("零") absorbs repeats.
        graph_hundreds_complex = (graph_digits + delete_hundreds + graph_all) | (
            graph_digits + delete_hundreds + pynini.cross(pynini.closure("零"), "0") + graph_digits
        )
        graph_hundreds = graph_hundreds_complex
        graph_hundreds = graph_hundreds | pynutil.insert("000")

        # Grammar for thousands 千.  "simple" keeps the marker (quantity form);
        # "complex" expands it, padding each skipped place with zeros.
        graph_thousands_simple = graph_digits + closure_thousands
        graph_thousands_complex = (
            (graph_digits + delete_thousands + graph_hundreds_complex)
            | (graph_digits + delete_thousands + pynini.cross(pynini.closure("零"), "0") + graph_all)
            | (graph_digits + delete_thousands + pynini.cross(pynini.closure("零"), "00") + graph_digits)
        )
        graph_thousands = graph_thousands_simple | graph_thousands_complex
        graph_thousands = graph_thousands | pynutil.insert("000")

        # Grammar for ten thousands 万.
        graph_ten_thousands_simple = graph_digits + closure_ten_thousands
        graph_ten_thousands_complex = (
            (graph_digits + delete_ten_thousands + graph_thousands_complex)
            | (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "0") + graph_hundreds_complex)
            | (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all)
            | (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits)
        )
        graph_ten_thousands = graph_ten_thousands_simple | graph_ten_thousands_complex | pynutil.insert("00000")

        # Grammar for hundred thousands 十万.
        graph_hundred_thousands_simple = graph_all + closure_ten_thousands
        graph_hundred_thousands_complex = (
            (graph_tens + delete_ten_thousands + graph_thousands_complex)
            | (graph_tens + delete_ten_thousands + pynini.cross(pynini.closure("零"), "0") + graph_hundreds_complex)
            | (graph_tens + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all)
            | (graph_tens + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits)
        )
        graph_hundred_thousands = (graph_hundred_thousands_simple | graph_hundred_thousands_complex) | pynutil.insert(
            "000000"
        )

        # Grammar for millions 百万.  Negative weight prefers the compact
        # quantity form (e.g. 三百万) over the fully expanded reading.
        graph_millions_simple = graph_hundreds_complex + closure_ten_thousands
        graph_millions_complex = (
            (graph_hundreds_complex + delete_ten_thousands + graph_thousands_complex)
            | (
                graph_hundreds_complex
                + delete_ten_thousands
                + pynini.cross(pynini.closure("零"), "0")
                + graph_hundreds_complex
            )
            | (graph_hundreds_complex + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all)
            | (graph_hundreds_complex + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits)
        )
        graph_millions = (
            pynutil.add_weight(graph_millions_simple, -1.0) | graph_millions_complex | pynutil.insert("0000000")
        )

        # Grammar for ten millions 千万.
        graph_ten_millions_simple = graph_thousands_complex + closure_ten_thousands
        graph_ten_millions_complex = (
            (graph_thousands_complex + delete_ten_thousands + graph_thousands_complex)
            | (
                graph_thousands_complex
                + delete_ten_thousands
                + pynini.cross(pynini.closure("零"), "0")
                + graph_hundreds_complex
            )
            | (graph_thousands_complex + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all)
            | (
                graph_thousands_complex
                + delete_ten_thousands
                + pynini.cross(pynini.closure("零"), "000")
                + graph_digits
            )
        )
        graph_ten_millions = pynutil.add_weight(graph_ten_millions_simple, -1.0) | graph_ten_millions_complex
        graph_ten_millions = graph_ten_millions | pynutil.insert("00000000")

        # Grammar for hundred millions 亿.  Each alternative handles a run of
        # 零 standing in for 1..7 skipped decimal places.
        graph_hundred_millions_simple = graph_digits + closure_hundred_millions
        graph_hundred_millions_complex = (
            (graph_digits + delete_hundred_millions + graph_ten_millions_complex)
            | (
                graph_digits
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "0")
                + graph_millions_complex
            )
            | (
                graph_digits
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "00")
                + graph_hundred_thousands_complex
            )
            | (
                graph_digits
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "000")
                + graph_ten_thousands_complex
            )
            | (
                graph_digits
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "0000")
                + graph_thousands_complex
            )
            | (
                graph_digits
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "00000")
                + graph_hundreds_complex
            )
            | (graph_digits + delete_hundred_millions + pynini.cross(pynini.closure("零"), "000000") + graph_all)
            | (graph_digits + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits)
        )
        graph_hundred_millions = (
            graph_hundred_millions_simple | graph_hundred_millions_complex | pynutil.insert("000000000")
        )

        # Grammar for billions 十亿.
        graph_billions_simple = graph_all + closure_hundred_millions
        graph_billions_complex = (
            (graph_all + delete_hundred_millions + graph_ten_millions_complex)
            | (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0") + graph_millions_complex)
            | (
                graph_all
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "00")
                + graph_hundred_thousands_complex
            )
            | (
                graph_all
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "000")
                + graph_ten_thousands_complex
            )
            | (
                graph_all
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "0000")
                + graph_thousands_complex
            )
            | (
                graph_all
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "00000")
                + graph_hundreds_complex
            )
            | (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "000000") + graph_all)
            | (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits)
        )
        graph_billions = graph_billions_simple | graph_billions_complex | pynutil.insert("0000000000")

        # Grammar for ten billions 百亿.
        graph_ten_billions_simple = graph_hundreds_complex + closure_hundred_millions
        graph_ten_billions_complex = (
            (graph_hundreds_complex + delete_hundred_millions + graph_ten_millions_complex)
            | (
                graph_hundreds_complex
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "0")
                + graph_millions_complex
            )
            | (
                graph_hundreds_complex
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "00")
                + graph_hundred_thousands_complex
            )
            | (
                graph_hundreds_complex
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "000")
                + graph_ten_thousands_complex
            )
            | (
                graph_hundreds_complex
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "0000")
                + graph_thousands_complex
            )
            | (
                graph_hundreds_complex
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "00000")
                + graph_hundreds_complex
            )
            | (
                graph_hundreds_complex
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "000000")
                + graph_all
            )
            | (
                graph_hundreds_complex
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "0000000")
                + graph_digits
            )
        )
        graph_ten_billions = graph_ten_billions_simple | graph_ten_billions_complex | pynutil.insert("00000000000")

        # Grammar for hundred billions 千亿.
        graph_hundred_billions_simple = graph_thousands_complex + closure_hundred_millions
        graph_hundred_billions_complex = (
            (graph_thousands_complex + delete_hundred_millions + graph_ten_millions_complex)
            | (
                graph_thousands_complex
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "0")
                + graph_millions_complex
            )
            | (
                graph_thousands_complex
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "00")
                + graph_hundred_thousands_complex
            )
            | (
                graph_thousands_complex
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "000")
                + graph_ten_thousands_complex
            )
            | (
                graph_thousands_complex
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "0000")
                + graph_thousands_complex
            )
            | (
                graph_thousands_complex
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "00000")
                + graph_hundreds_complex
            )
            | (
                graph_thousands_complex
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "000000")
                + graph_all
            )
            | (
                graph_thousands_complex
                + delete_hundred_millions
                + pynini.cross(pynini.closure("零"), "0000000")
                + graph_digits
            )
        )
        graph_hundred_billions = graph_hundred_billions_simple | graph_hundred_billions_complex

        # Combining grammar; output for the cardinal grammar.
        graph = pynini.union(
            graph_hundred_billions,
            graph_ten_billions,
            graph_billions,
            graph_hundred_millions,
            graph_ten_millions,
            graph_millions,
            graph_hundred_thousands,
            graph_ten_thousands,
            graph_thousands,
            graph_hundreds,
            graph_all_re,
        )

        # Combining grammar; output consists only of Arabic numbers (the fully
        # expanded "complex" branches), reused by other grammars.
        graph_just_cardinals = pynini.union(
            graph_hundred_billions_complex,
            graph_ten_billions_complex,
            graph_billions_complex,
            graph_hundred_millions_complex,
            graph_ten_millions_complex,
            graph_millions_complex,
            graph_hundred_thousands_complex,
            graph_ten_thousands_complex,
            graph_thousands_complex,
            graph_hundreds_complex,
            graph_all,
            graph_teens,
            graph_digits,
            zero,
        )

        # Delete unnecessary leading zeros produced by the zero-padding above.
        delete_leading_zeros = pynutil.delete(pynini.closure("0"))
        stop_at_non_zero = pynini.difference(NEMO_DIGIT, "0")
        rest_of_cardinal = pynini.closure(NEMO_DIGIT) | pynini.closure(NEMO_SIGMA)

        # Output for cardinal grammar without leading zero ("0" alone is kept).
        clean_cardinal = delete_leading_zeros + stop_at_non_zero + rest_of_cardinal
        clean_cardinal = clean_cardinal | "0"
        graph = graph @ clean_cardinal  # output for regular cardinals
        self.for_ordinals = graph  # used for ordinal grammars

        # Output for pure Arabic numbers without leading zero.
        clean_just_cardinal = delete_leading_zeros + stop_at_non_zero + rest_of_cardinal
        clean_just_cardinal = clean_just_cardinal | "0"
        graph_just_cardinals = graph_just_cardinals @ clean_just_cardinal  # output for other grammars
        self.just_cardinals = graph_just_cardinals  # used for other grammars

        # Final grammar for cardinal output; tokenization.  Handles both the
        # simplified (负) and traditional (負) minus signs.
        optional_minus_graph = (pynini.closure(pynutil.insert("negative: ") + pynini.cross("负", "\"-\""))) | (
            pynini.closure(pynutil.insert("negative: ") + pynini.cross("負", "\"-\""))
        )
        final_graph = optional_minus_graph + pynutil.insert("integer: \"") + graph + pynutil.insert("\"")
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph
import pynini
from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst
from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path
from pynini.lib import pynutil


class DateFst(GraphFst):
    def __init__(self):
        """
        Finite state transducer for classifying date
        1798年五月三十日 -> date { year: "1798" month: "5" day: "30" }
        五月三十日 -> date { month: "5" day: "30" }
        一六七二年 -> date { year: "1672" }
        """
        super().__init__(name="date", kind="classify")

        digits = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv"))  # imported for year-component
        months = pynini.string_file(get_abs_path("data/date/month-nano.tsv"))  # imported for month-component
        days = pynini.string_file(get_abs_path("data/date/day-nano.tsv"))  # imported for day-component

        # Grammar for year: digit-by-digit reading with interleaved 零 -> 0,
        # terminated by the year marker 年 (deleted).
        graph_year = (
            pynini.closure(digits)
            + pynini.closure(pynini.cross("零", "0"))
            + pynini.closure(digits)
            + pynini.closure(pynini.cross("零", "0"))
            + pynutil.delete("年")
        )
        graph_year = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"")

        # Grammar for month: month name followed by the marker 月 (deleted).
        graph_month = pynutil.insert("month: \"") + months + pynutil.delete("月") + pynutil.insert("\"")

        # Grammar for day: day number followed by one of the day suffixes
        # 日 / 号 / 號 (deleted).
        graph_day_suffix = pynini.accep("日") | pynini.accep("号") | pynini.accep("號")
        graph_delete_day_suffix = pynutil.delete(graph_day_suffix)
        graph_day = pynutil.insert("day: \"") + days + graph_delete_day_suffix + pynutil.insert("\"")

        # Grammar for combinations of year+month, month+day, and year+month+day.
        graph_ymd = graph_year + pynutil.insert(" ") + graph_month + pynutil.insert(" ") + graph_day
        graph_ym = graph_year + pynutil.insert(" ") + graph_month
        graph_md = graph_month + pynutil.insert(" ") + graph_day

        # Final grammar for standard date.
        graph_date = graph_ymd | graph_ym | graph_md | graph_year | graph_month | graph_day
        # graph_date = graph_year | graph_month | graph_day

        # Grammar for optional era prefix: B.C. (紀元前/公元前/纪元前) ...
        graph_bc_prefix = pynini.closure("紀元前", 0, 1) | pynini.closure("公元前", 0, 1) | pynini.closure("纪元前", 0, 1)
        graph_bc = pynutil.delete(graph_bc_prefix)

        # ... and A.D. (公元/公元后/紀元/纪元/西元).
        # NOTE(review): the `+` between the 公元后 and 紀元 closures concatenates
        # them rather than treating them as alternatives like the surrounding
        # `|` branches — looks like it was meant to be `|`; confirm intent.
        graph_ad_prefix = (
            pynini.closure("公元", 0, 1)
            | pynini.closure("公元后", 0, 1) + pynini.closure("紀元", 0, 1)
            | pynini.closure("纪元", 0, 1)
            | pynini.closure("西元", 0, 1)
        )
        graph_ad = pynutil.delete(graph_ad_prefix)

        # Era prefixes are deleted from the input and re-emitted as an
        # `era:` field after the date fields.
        graph_suffix_bc = (
            graph_bc + graph_date + pynutil.insert(" era: \"") + pynutil.insert("B.C.") + pynutil.insert("\"")
        )
        graph_suffix_ad = (
            graph_ad + graph_date + pynutil.insert(" era: \"") + pynutil.insert("A.D.") + pynutil.insert("\"")
        )

        graph_era = graph_suffix_bc | graph_suffix_ad

        # Grammar for standard date and with era.
        graph_date_final = graph_era | graph_date

        # graph_date_final = graph_date

        final_graph = self.add_tokens(graph_date_final)
        self.fst = final_graph.optimize()
import pynini
from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst
from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path
from pynini.lib import pynutil


def get_quantity(decimal, cardinal):
    """
    Attaches a magnitude quantity suffix (万/亿 families, simplified,
    traditional and financial variants) to a cardinal or decimal graph,
    emitting it as a `quantity:` field.

    Args:
        decimal: decimal graph (already tokenized with integer/fractional parts)
        cardinal: cardinal graph producing the bare integer string
    """
    suffix = pynini.union(
        "万",
        "十万",
        "百万",
        "千万",
        "亿",
        "十亿",
        "百亿",
        "千亿",
        "萬",
        "十萬",
        "百萬",
        "千萬",
        "億",
        "十億",
        "百億",
        "千億",
        "拾萬",
        "佰萬",
        "仟萬",
        "拾億",
        "佰億",
        "仟億",
        "拾万",
        "佰万",
        "仟万",
        "仟亿",
        "佰亿",
        "仟亿",
    )
    numbers = cardinal
    # cardinal + quantity, e.g. 五万 -> integer_part: "5" quantity: "万"
    res = (
        pynutil.insert("integer_part: \"")
        + numbers
        + pynutil.insert("\"")
        + pynutil.insert(" quantity: \"")
        + suffix
        + pynutil.insert("\"")
    )
    # decimal + quantity, e.g. 五点五万
    res = res | decimal + pynutil.insert(" quantity: \"") + suffix + pynutil.insert("\"")

    return res


class DecimalFst(GraphFst):
    def __init__(self, cardinal: GraphFst):
        """
        Finite state transducer for classifying decimals, e.g.
        一点二 -> decimal { integer_part: "1" fractional_part: "2" }

        Args:
            cardinal: CardinalFst (its just_cardinals graph is reused here)
        """
        super().__init__(name="decimal", kind="classify")

        cardinal_after_decimal = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv"))
        cardinal_before_decimal = cardinal.just_cardinals | (pynini.closure(pynini.cross("零", "0"), 0, 1))

        delete_decimal = pynutil.delete("点") | pynutil.delete(
            "點"
        )  # delete decimal character, 'point' in english in 'one point two for 1.2'

        # Grammar for integer part.
        graph_integer = (
            pynutil.insert("integer_part: \"")
            + (cardinal_before_decimal | (pynini.closure(pynini.cross("零", "0"), 0, 1)))
            + pynutil.insert("\" ")
        )  # tokenization on just numbers
        graph_integer_or_none = graph_integer | pynutil.insert("integer_part: \"0\" ", weight=0.01)  # integer or zero

        # Grammar for fractional part: digit-by-digit with 零 -> 0 in between.
        delete_zero = pynini.closure(pynini.cross("零", "0"))
        graph_string_of_cardinals = cardinal_after_decimal
        graph_string_of_cardinals = (
            pynini.closure(graph_string_of_cardinals) + delete_zero + pynini.closure(graph_string_of_cardinals)
        )
        graph_fractional = pynutil.insert("fractional_part: \"") + graph_string_of_cardinals + pynutil.insert("\"")

        # Grammar for decimal: integer + deleted decimal point + fractional part.
        graph_decimal_no_sign = graph_integer_or_none + delete_decimal + graph_fractional

        # New grammar added for money.
        # NOTE(review): this attribute is reassigned below after negatives are
        # added, so this first assignment is dead — confirm which graph the
        # money grammar is meant to consume.
        self.final_graph_wo_negative = graph_decimal_no_sign | get_quantity(
            graph_decimal_no_sign, cardinal.just_cardinals
        )

        graph_negative = pynini.cross("负", "negative: \"-\" ") | pynini.cross("負", "negative: \"-\" ")
        graph_negative = pynini.closure(graph_negative, 0, 1)  # captures only one "负"

        graph_decimal = graph_negative + graph_decimal_no_sign
        graph_decimal = graph_decimal | (graph_negative + get_quantity(graph_decimal_no_sign, cardinal_before_decimal))
        # NOTE(review): despite the "_wo_negative" name, this graph includes the
        # optional negative sign.
        self.final_graph_wo_negative = graph_decimal

        final_graph = self.add_tokens(graph_decimal)
        self.fst = final_graph.optimize()
import pynini
from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst
from pynini.lib import pynutil


class FractionFst(GraphFst):
    """
    Finite state transducer for classifying fraction
    e.g. 二分之一 -> tokens { fraction { denominator: "2" numerator: "1"} }
    e.g. 五又二分之一 -> tokens { fraction { integer_part: "5" denominator: "2" numerator: "1" } }

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="fraction", kind="classify")

        graph_cardinal = cardinal.just_cardinals
        # Whole-number part of a mixed fraction (before 又).
        integer_component = pynutil.insert("integer_part: \"") + graph_cardinal + pynutil.insert("\"")
        # Chinese fractions read denominator first: X分之Y = Y/X, so the
        # denominator is parsed before the numerator.
        denominator_component = (
            pynutil.insert("denominator: \"") + graph_cardinal + pynutil.delete("分之") + pynutil.insert("\"")
        )
        numerator_component = pynutil.insert("numerator: \"") + graph_cardinal + pynutil.insert("\"")

        graph_only_fraction = denominator_component + pynutil.insert(" ") + numerator_component
        # Mixed fraction: integer part joined by 又 ("and"), e.g. 五又二分之一.
        graph_fraction_with_int = integer_component + pynutil.delete("又") + pynutil.insert(" ") + graph_only_fraction

        graph_fraction = graph_only_fraction | graph_fraction_with_int

        final_graph = self.add_tokens(graph_fraction)
        self.fst = final_graph.optimize()
import pynini

from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst
from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path
from pynini.lib import pynutil


class MoneyFst(GraphFst):
    """
    Finite state transducer for classifying money expressions.

    Produces `integer_part` / `fractional_part` (for non-RMB style amounts) or
    `cent_part` / `tencent_part` (for RMB 元/角/分 style amounts) plus a
    `currency` field for the verbalizer to consume.

    Args:
        cardinal: CardinalFst; its `for_ordinals` graph supplies the digit grammar.
        decimal: DecimalFst; its `final_graph_wo_negative` handles decimal amounts.
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="money", kind="classify")

        # data files: currency symbol/word mappings and single-digit tables
        minor_currency_cent = pynini.string_file(get_abs_path("data/money/currency_rmb_minor_cent-nano.tsv"))
        minor_currency_tencent = pynini.string_file(get_abs_path("data/money/currency_rmb_minor_tencent-nano.tsv"))
        minor_digit = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv"))
        zero = pynini.string_file(get_abs_path("data/numbers/zero-nano.tsv"))
        major_currency = pynini.string_file(get_abs_path("data/money/currency_major-nano.tsv"))
        minor_currency = pynini.string_file(get_abs_path("data/money/currency_minor-nano.tsv"))
        graph_cardinal = cardinal.for_ordinals
        graph_decimal = decimal.final_graph_wo_negative
        # RMB sub-unit positions (角/分) carry a single digit each
        fraction_integer = minor_digit | zero

        # add leading zero to the number: 1 -> 01
        add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)
        graph_fractional_values = graph_cardinal @ add_leading_zero_to_double_digit

        # regular number and yuan part
        graph_integer_component = pynutil.insert("integer_part: \"") + graph_cardinal + pynutil.insert("\"")
        graph_fractional_component = (
            pynutil.insert("fractional_part: \"")
            + graph_fractional_values
            + pynutil.insert("\"")
            + pynutil.delete(minor_currency)
        )
        graph_fractional_component_ex = (
            pynutil.insert("fractional_part: \"") + graph_fractional_values + pynutil.insert("\"")
        )

        # regular symbol part
        graph_major_currency = pynutil.insert("currency: \"") + major_currency + pynutil.insert("\"")
        graph_minor_currency = pynutil.insert("currency: \"") + minor_currency + pynutil.insert("\"")

        # regular combine number and symbol part
        graph_only_major = graph_integer_component + pynutil.insert(" ") + graph_major_currency
        graph_only_minor = graph_fractional_component_ex + pynutil.insert(" ") + graph_minor_currency
        graph_money = graph_only_major + pynutil.insert(" ") + graph_fractional_component

        # regular large money with decimals
        graph_large_money = graph_decimal + pynutil.insert(" ") + graph_major_currency

        # final graph for regular currency
        graph_regular_money = graph_only_major | graph_only_minor | graph_money | graph_large_money

        # yuan number part: one digit each for the 角 (tencent) and 分 (cent) slots
        graph_cent_fractional_comp = pynutil.insert("cent_part: \"") + fraction_integer + pynutil.insert("\"")
        graph_tencent_fractional_comp = pynutil.insert("tencent_part: \"") + fraction_integer + pynutil.insert("\"")

        # yuan symbol part
        # graph_currency_major = pynutil.insert("currency: \"") + major_currency + pynutil.insert("\"")
        graph_currency_minor_cent = pynutil.insert("currency: \"") + minor_currency_cent + pynutil.insert("\"")
        graph_currency_minor_tencent = pynutil.insert("currency: \"") + minor_currency_tencent + pynutil.insert("\"")

        # yuan combine number and symbol part
        # graph_only_major_yuan = graph_integer_component + pynutil.insert(" ") + graph_major_currency# + pynutil.insert(" ") + graph_currency_rmb_token
        graph_only_cent = graph_cent_fractional_comp + pynutil.insert(" ") + graph_currency_minor_cent
        graph_only_tencent = graph_tencent_fractional_comp + pynutil.insert(" ") + graph_currency_minor_tencent

        # yuan major plus minor: the unit characters themselves are consumed here
        symbols = pynini.union('元', '毛', '角', '分')
        delete_symbols = pynutil.delete(symbols)
        graph_major_cent = (
            graph_integer_component
            + delete_symbols
            + pynutil.insert(" ")
            + graph_cent_fractional_comp
            + pynutil.insert(" ")
            + graph_currency_minor_cent
        )
        graph_major_tencent = (
            graph_integer_component
            + delete_symbols
            + pynutil.insert(" ")
            + graph_tencent_fractional_comp
            + pynutil.insert(" ")
            + graph_currency_minor_tencent
        )
        graph_tencent_cent = (
            graph_tencent_fractional_comp
            + delete_symbols
            + pynutil.insert(" ")
            + graph_cent_fractional_comp
            + pynutil.insert(" ")
            + graph_currency_minor_cent
        )
        graph_major_minor = (
            graph_integer_component
            + delete_symbols
            + pynutil.insert(" ")
            + graph_tencent_fractional_comp
            + pynutil.insert(" ")
            + delete_symbols
            + graph_cent_fractional_comp
            + pynutil.insert(" ")
            + graph_currency_minor_cent
        )

        # final graph for yuan
        # graph_yuan_only = graph_only_major_yuan | graph_only_cent | graph_only_tencent
        graph_yuan_only = graph_only_cent | graph_only_tencent
        graph_yuan_comb = graph_major_cent | graph_major_tencent | graph_tencent_cent | graph_major_minor

        # combining both
        graph_yuan = graph_yuan_only | graph_yuan_comb
        graph_final = graph_regular_money | graph_yuan
        final = self.add_tokens(graph_final)
        self.fst = final.optimize()
import pynini

from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst
from nemo_text_processing.inverse_text_normalization.zh.taggers.cardinal import CardinalFst
from pynini.lib import pynutil


class OrdinalFst(GraphFst):
    """
    Finite state transducer for classifying ordinals: the ordinal marker 第 is
    accepted verbatim and the trailing cardinal is normalized by the cardinal
    grammar, the whole result landing in the `integer` field.

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="ordinal", kind="classify")

        ordinal_marker = pynini.accep("第")
        ordinal = ordinal_marker + cardinal.for_ordinals

        tagged = pynutil.insert("integer: \"") + ordinal + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst
from pynini.lib import pynutil


class PunctuationFst(GraphFst):
    """
    Finite state transducer for classifying punctuation
    e.g. a, -> tokens { name: "a" } tokens { name: "," }
    """

    def __init__(self):
        super().__init__(name="punctuation", kind="classify")

        # every ASCII punctuation mark becomes its own one-character acceptor
        marks = "!#$%&'()*+,-./:;<=>?@^_`{|}~"
        accept_punct = pynini.union(*marks)

        tagged = pynutil.insert("name: \"") + accept_punct + pynutil.insert("\"")
        self.fst = tagged.optimize()
import pynini

from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst, delete_space
from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path
from pynini.lib import pynutil


class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time
    e.g., 五点 -> time { hours: "5" minutes: "00" }
    e.g., 正午 -> time { hours: "12" minutes: "00" }
    e.g., 两点一刻 -> time { hours: "2" minutes: "15" }
    e.g., 上午九点 -> time { hours: "9" minutes: "00" affix: "a.m." }
    e.g., 五点差五分 -> time { hours: "4" minutes: "55" }
    """

    def __init__(self):
        super().__init__(name="time", kind="classify")

        # data imported
        hours = pynini.string_file(get_abs_path("data/time/time_hours-nano.tsv"))  # hours from 1 to 24
        minutes = pynini.string_file(get_abs_path("data/time/time_minutes-nano.tsv"))  # minutes from 1 to 60
        hours_to = pynini.string_file(get_abs_path("data/time/hours_to-nano.tsv"))  # used for back counting, see below
        minutes_to = pynini.string_file(
            get_abs_path("data/time/minutes_to-nano.tsv")
        )  # used for back counting, see below

        # graph for one quarter (e.g., 10:15)
        graph_quarter = pynini.accep("一刻") | pynini.accep("壹刻") | pynini.accep("幺刻")
        graph_quarter = pynini.cross(graph_quarter, "15")

        # grammar for two quarters or half (e.g., 10:30)
        # FIX: the original attached `.ques` to 半, which also mapped the EMPTY
        # string to "30" and clashed with the default `pynutil.insert("00")`.
        graph_half = pynini.cross("半", "30")
        graph_half_alt = pynini.accep("二刻") | pynini.accep("貳刻") | pynini.accep("两刻") | pynini.accep("兩刻")
        graph_half_alt = pynini.cross(graph_half_alt, "30")
        graph_half = graph_half | graph_half_alt

        # grammar for three quarters (e.g., 10:45)
        # FIX: the original `pynini.accep("三刻", "45")` passed "45" as accep's
        # token_type argument instead of building an acceptor.
        graph_three_quarter = pynini.accep("三刻") | pynini.accep("叁刻")
        graph_three_quarter = pynini.cross(graph_three_quarter, "45")

        # combining grammars quarter, two quarter, and three quarter
        graph_fractions = graph_quarter | graph_half | graph_three_quarter

        # graph for "Noon-12PM" and midnight
        graph_noon = pynini.cross("中午", "12") | pynini.cross("正午", "12") | pynini.cross("午间", "12")
        graph_midnight = pynini.cross("午夜", "0") | pynini.cross("半夜", "0") | pynini.cross("子夜", "0")

        # graph for hour
        graph_delete_hours = (
            pynutil.delete("点") | pynutil.delete("點") | pynutil.delete("时") | pynutil.delete("時")
        )  # "点": Mandarin for "hour | o'clock" (e.g., 十点 = ten o'clock)
        graph_hours = hours + graph_delete_hours

        # graph for minutes; the trailing 分 ("minute") is consumed
        graph_minutes = minutes + pynutil.delete('分')

        # add tokenization for hours position component
        graph_hours_component = pynini.union(graph_hours, graph_noon, graph_midnight)
        graph_hours_component = pynutil.insert("hours: \"") + graph_hours_component + pynutil.insert("\"")

        # add tokenization for minutes position component; "00" is the default
        # when no minutes are spoken
        graph_minutes_component = pynini.union(graph_minutes, graph_fractions) | pynutil.insert("00")
        graph_minutes_component = pynutil.insert(" minutes: \"") + graph_minutes_component + pynutil.insert("\"")
        graph_minutes_component = delete_space + graph_minutes_component

        # digit + "hours" + digit + "minutes/null" (e.g., 十点五十分 -> hours: "10" minutes: "50")
        graph_time_standard = graph_hours_component + graph_minutes_component

        # combined hours and minutes but with a.m./p.m. prefix words
        graph_time_standard_affix = (
            (pynutil.delete("上午") | pynutil.delete("早上"))
            + graph_time_standard
            + pynutil.insert(" affix: \"")
            + pynutil.insert("a.m.")
            + pynutil.insert("\"")
        ) | (
            (pynutil.delete("下午") | pynutil.delete("晚上"))
            + graph_time_standard
            + pynutil.insert(" affix: \"")
            + pynutil.insert("p.m.")
            + pynutil.insert("\"")
        )

        # 上午十點五十 -> hours: "10" minutes: "50" affix: "a.m."
        graph_time_standard = graph_time_standard | graph_time_standard_affix

        # grammar for back-counting ("五点差五分" = five minutes to five)
        # converting hours back via the hours_to data file
        graph_hours_to_component = graph_hours | graph_noon | graph_midnight
        graph_hours_to_component @= hours_to
        graph_hours_to_component = pynutil.insert("hours: \"") + graph_hours_to_component + pynutil.insert("\"")

        # converting minutes back via the minutes_to data file
        graph_minutes_to_component = minutes | graph_half | graph_quarter | graph_three_quarter | graph_half_alt
        graph_minutes_to_component @= minutes_to
        graph_minutes_to_component = pynutil.insert(" minutes: \"") + graph_minutes_to_component + pynutil.insert("\"")

        graph_delete_back_counting = pynutil.delete("差") | pynutil.delete("还有") | pynutil.delete("還有")
        graph_delete_minutes = pynutil.delete("分") | pynutil.delete("分钟") | pynutil.delete("分鐘")

        graph_time_to = (
            graph_hours_to_component + graph_delete_back_counting + graph_minutes_to_component + graph_delete_minutes
        )
        # adding a.m. and p.m.
        # FIX: the p.m. branch previously emitted " prefix: \"", inconsistent
        # with every other branch (and the a.m. branch), which emit " affix: \"".
        graph_time_to_affix = (
            (pynutil.delete("上午") | pynutil.delete("早上"))
            + graph_time_to
            + pynutil.insert(" affix: \"")
            + pynutil.insert("a.m.")
            + pynutil.insert("\"")
        ) | (
            (pynutil.delete("下午") | pynutil.delete("晚上"))
            + graph_time_to
            + pynutil.insert(" affix: \"")
            + pynutil.insert("p.m.")
            + pynutil.insert("\"")
        )
        graph_time_to = graph_time_to | graph_time_to_affix

        # final grammar
        final_graph = graph_time_standard | graph_time_to
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
import logging
import os
import pynini

from nemo_text_processing.inverse_text_normalization.zh.graph_utils import (
    GraphFst,
    delete_extra_space,
    delete_space,
    generator_main,
)
from nemo_text_processing.inverse_text_normalization.zh.taggers.cardinal import CardinalFst
from nemo_text_processing.inverse_text_normalization.zh.taggers.date import DateFst
from nemo_text_processing.inverse_text_normalization.zh.taggers.decimal import DecimalFst

# from nemo_text_processing.inverse_text_normalization.zh.taggers.electronic import ElectronicFst
from nemo_text_processing.inverse_text_normalization.zh.taggers.fraction import FractionFst

# from nemo_text_processing.inverse_text_normalization.zh.taggers.measure import MeasureFst
from nemo_text_processing.inverse_text_normalization.zh.taggers.money import MoneyFst
from nemo_text_processing.inverse_text_normalization.zh.taggers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.zh.taggers.punctuation import PunctuationFst

# from nemo_text_processing.inverse_text_normalization.zh.taggers.telephone import TelephoneFst
from nemo_text_processing.inverse_text_normalization.zh.taggers.time import TimeFst

# from nemo_text_processing.inverse_text_normalization.zh.taggers.whitelist import WhiteListFst
from nemo_text_processing.inverse_text_normalization.zh.taggers.word import WordFst
from pynini.lib import pynutil


class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.

    Args:
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
    """

    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
        super().__init__(name="tokenize_and_classify", kind="classify")

        # Restore the compiled grammar from the FAR cache when possible;
        # building the union of all grammars from scratch is expensive.
        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "_zh_itn.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            logging.info(f"Creating ClassifyFst grammars.")
            # cardinal is built first because ordinal/decimal/money/fraction
            # all reuse its sub-graphs
            cardinal = CardinalFst()
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal)
            decimal_graph = decimal.fst

            # measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
            # date_graph = DateFst(ordinal=ordinal).fst
            date = DateFst()
            date_graph = date.fst
            word_graph = WordFst().fst
            time_graph = TimeFst().fst
            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
            fraction = FractionFst(cardinal)
            fraction_graph = fraction.fst
            # whitelist_graph = WhiteListFst().fst
            punct_graph = PunctuationFst().fst
            # electronic_graph = ElectronicFst().fst
            # telephone_graph = TelephoneFst(cardinal).fst

            # Lower weight wins at shortest-path time: date (1.09) is slightly
            # preferred over the 1.1 group, decimal is de-prioritized at 1.2,
            # and the plain-word fallback (100) only fires when nothing matches.
            classify = (
                # pynutil.add_weight(whitelist_graph, 1.01)
                pynutil.add_weight(time_graph, 1.1)
                | pynutil.add_weight(date_graph, 1.09)
                | pynutil.add_weight(decimal_graph, 1.2)
                # | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(money_graph, 1.1)
                | pynutil.add_weight(fraction_graph, 1.1)
                # | pynutil.add_weight(telephone_graph, 1.1)
                # | pynutil.add_weight(electronic_graph, 1.1)
                | pynutil.add_weight(word_graph, 100)
            )

            # wrap every classified span in a "tokens { ... }" protobuf message;
            # punctuation may attach before or after a token
            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
            )

            graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
import pynini
from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_SPACE, GraphFst
from pynini.lib import pynutil


class WordFst(GraphFst):
    """
    Finite state transducer for classifying plain tokens that do not belong to
    any special class; this is the default (fallback) class.
    e.g. sleep -> tokens { name: "sleep" }
    """

    def __init__(self):
        super().__init__(name="word", kind="classify")

        # one or more non-space characters, passed through untouched
        token = pynini.closure(NEMO_NOT_SPACE, 1)
        graph = pynutil.insert('name: "') + token + pynutil.insert('"')
        self.fst = graph.optimize()
import os
from typing import Union

import inflect

_inflect = inflect.engine()


def num_to_word(x: Union[str, int]):
    """
    Convert an integer (or its string form) to its spoken English representation.

    Args:
        x: integer

    Returns: spoken representation
    """
    value = str(x) if isinstance(x, int) else x
    spoken = _inflect.number_to_words(str(value))
    return spoken.replace("-", " ").replace(",", "")


def get_abs_path(rel_path):
    """
    Get absolute path.

    Args:
        rel_path: relative path to this file

    Returns absolute path
    """
    here = os.path.dirname(os.path.abspath(__file__))
    return here + '/' + rel_path
import pynini

from nemo_text_processing.inverse_text_normalization.zh.graph_utils import (
    NEMO_DIGIT,
    NEMO_SIGMA,
    GraphFst,
    delete_space,
)
from pynini.lib import pynutil


class CardinalFst(GraphFst):
    """
    Finite state transducer for verbalizing cardinals: strips the
    `negative:`/`integer:` field wrappers and inserts a "," between every
    three digits of the integer part.
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="verbalize")

        # digit grouping: a leading group of 1-3 digits, then comma-separated
        # groups of exactly three
        three_digits = NEMO_DIGIT ** 3
        up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3)

        # magnitude characters (simplified and traditional variants) that may
        # trail the digit string and are passed through unchanged
        magnitude = pynini.union(
            "千", "仟", "万", "十万", "百万", "千万", "亿", "十亿", "百亿", "千亿",
            "萬", "十萬", "百萬", "千萬", "億", "十億", "百億", "千億",
            "拾萬", "佰萬", "仟萬", "拾億", "佰億", "仟億",
            "拾万", "佰万", "仟万", "仟亿", "佰亿",
        )

        group_by_threes = (
            up_to_three_digits + (pynutil.insert(",") + three_digits).closure()
        ) + pynini.closure(magnitude)

        # strip the negative attribute, keeping only the "-" sign when present
        optional_sign = pynini.closure(
            pynutil.delete("negative: ")
            + delete_space
            + pynutil.delete("\"")
            + pynini.accep("-")
            + pynutil.delete("\"")
            + delete_space
        )

        # strip the integer field wrapper, then regroup its digits
        strip_integer = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_DIGIT, 0, 1)
            + pynini.closure(NEMO_SIGMA)
            + pynutil.delete("\"")
        )

        graph = optional_sign + (strip_integer @ group_by_threes)

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
import pynini

from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
from pynini.lib import pynutil


class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing date, e.g.
    date { year: "1798" month: "5" day: "30" } -> 1798年5月30日
    date { year: "1798" month: "5" } -> 1798年5月
    date { month: "5" day: "30" } -> 5月30日
    An optional era field renders as 公元 (A.D.) or 公元前 (B.C.).
    """

    def __init__(self):
        super().__init__(name="date", kind="verbalize")

        # removing tokenization for year, month and day
        year = (
            pynutil.delete("year:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + delete_space
            + pynutil.delete("\"")
        )
        month = (
            pynutil.delete("month:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + delete_space
            + pynutil.delete("\"")
        )
        day = (
            pynutil.delete("day:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + delete_space
            + pynutil.delete("\"")
        )
        era = pynutil.delete("era:")
        # FIX: the variable names were swapped in the original — 公元 is the
        # rendering of A.D. and 公元前 of B.C.
        ad = era + delete_space + pynutil.delete("\"") + pynini.cross("A.D.", "公元") + pynutil.delete("\"")
        bc = era + delete_space + pynutil.delete("\"") + pynini.cross("B.C.", "公元前") + pynutil.delete("\"")

        # combining above 3 for variations
        graph_ymd = (
            year
            + pynutil.insert("年")
            + delete_space
            + month
            + pynutil.insert("月")
            + delete_space
            + day
            + pynutil.insert("日")
        )
        graph_ym = year + pynutil.insert("年") + delete_space + month + pynutil.insert("月")
        graph_md = month + pynutil.insert("月") + delete_space + day + pynutil.insert("日")
        graph_year = year + pynutil.insert("年")
        graph_month = month + pynutil.insert("月")
        graph_day = day + pynutil.insert("日")
        graph_era = bc | ad

        # FIX: bound the closure to (0, 1) — the era is optional, not repeatable;
        # the original unbounded closure admitted paths with multiple eras.
        optional_era = pynini.closure(graph_era, 0, 1)

        final_graph = (
            optional_era + delete_space + (graph_ymd | graph_ym | graph_md | graph_year | graph_month | graph_day)
        )

        delete_tokens = self.delete_tokens(final_graph)
        self.fst = delete_tokens.optimize()
import pynini

from nemo_text_processing.inverse_text_normalization.zh.graph_utils import (
    NEMO_DIGIT,
    NEMO_NOT_QUOTE,
    GraphFst,
    delete_space,
)
from pynini.lib import pynutil


class DecimalFst(GraphFst):
    """
    Finite state transducer for verbalizing decimals: strips the
    `negative:`/`integer_part:`/`fractional_part:`/`quantity:` wrappers and
    groups digits by threes with "," on both sides of the decimal point.
    """

    def __init__(self):
        super().__init__(name="decimal", kind="verbalize")

        # group numbers by three
        exactly_three_digits = NEMO_DIGIT ** 3
        at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3)

        # insert a "," for every three numbers before decimal point
        space_every_three_integer = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure()
        # insert a "," for every three numbers after decimal point
        space_every_three_decimal = (
            pynini.accep(".") + (exactly_three_digits + pynutil.insert(",")).closure() + at_most_three_digits
        )

        # combine both
        group_by_threes = space_every_three_integer | space_every_three_decimal
        self.group_by_threes = group_by_threes

        # removing tokenization, 'negative:' (keeps only the "-" sign)
        optional_sign = pynini.closure(
            pynutil.delete("negative: ")
            + delete_space
            + pynutil.delete("\"")
            + pynini.accep("-")
            + pynutil.delete("\"")
            + delete_space
        )

        # removing tokenization, 'integer_part:'
        integer = (
            pynutil.delete("integer_part:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        integer = integer @ group_by_threes
        optional_integer = pynini.closure(integer + delete_space, 0, 1)

        # removing tokenization, 'fractional_part:' (rendered after a ".")
        fractional = (
            pynutil.insert(".")
            + pynutil.delete("fractional_part:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        fractional = fractional @ group_by_threes
        optional_fractional = pynini.closure(fractional + delete_space, 0, 1)

        # removing tokenization, 'quantity:'
        quantity = (
            pynutil.delete("quantity:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        optional_quantity = pynini.closure(quantity + delete_space)

        # combining graphs removing tokenizations *3
        graph = (optional_integer + optional_fractional + optional_quantity).optimize()
        graph = optional_sign + graph  # add optional sign for negative number
        # FIX: expose a correctly spelled attribute; the misspelled `numebrs`
        # is kept as an alias for backward compatibility with existing callers.
        self.numbers = graph
        self.numebrs = graph
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini + +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst, delete_space +from pynini.lib import pynutil + + +class FractionFst(GraphFst): + """ + Finite state transducer for verbalizing fraction + e.g. tokens { fraction { denominator: "2" numerator: "1"} } -> 1/2 + e.g. tokens { fraction { integer_part: "1" denominator: "2" numerator: "1" } } -> 1又1/2 + """ + + def __init__(self): + super().__init__(name="fraction", kind="verbalize") + + integer_part = ( + pynutil.delete("integer_part:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_DIGIT) + + pynutil.insert("又") + + pynutil.delete("\"") + ) + denominator_part = ( + pynutil.delete("denominator:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_DIGIT) + + pynutil.delete("\"") + ) + numerator_part = ( + pynutil.delete("numerator:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_DIGIT) + + pynutil.insert("/") + + pynutil.delete("\"") + ) + + graph_with_integer = integer_part + delete_space + numerator_part + delete_space + denominator_part + graph_no_integer = numerator_part + delete_space + denominator_part + + final_graph = graph_with_integer | graph_no_integer + + delete_tokens = self.delete_tokens(final_graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py new file mode 100644 index 000000000..4f439a53e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py @@ -0,0 +1,70 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini + +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from pynini.lib import pynutil + + +class MoneyFst(GraphFst): + def __init__(self): + super().__init__(name="money", kind="verbalize") + + currency_unit = pynutil.delete("currency: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + number_unit = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + fraction_unit = ( + pynutil.delete("fractional_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) + cent_unit = pynutil.delete("cent_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + tencent_unit = pynutil.delete("tencent_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + decimal_unit = ( + pynutil.insert(".") + + pynutil.delete("fractional_part: \"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + + delete_space + + pynutil.delete("quantity: \"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + # regular money part + graph_money_regular = ( + currency_unit + delete_space + number_unit + delete_space + pynutil.insert(".") + fraction_unit + ) + graph_only_major_regular = currency_unit + delete_space + number_unit + graph_only_minor_regular = currency_unit + delete_space + pynutil.insert("0.") + fraction_unit + graph_large_money = currency_unit + delete_space + number_unit + delete_space + decimal_unit + + graph_regular = graph_money_regular | graph_only_major_regular | 
graph_only_minor_regular | graph_large_money + + # yuan part + graph_money_yuan = ( + currency_unit + + delete_space + + number_unit + + delete_space + + pynutil.insert(".") + + ((pynutil.insert("0") + cent_unit) | (tencent_unit) | (tencent_unit + delete_space + cent_unit)) + ) + graph_yuan_minors = ( + currency_unit + delete_space + pynutil.insert("0.") + tencent_unit + delete_space + cent_unit + ) + graph_yuan = graph_money_yuan | graph_yuan_minors + + graph_verbalizer = graph_regular | graph_yuan + + delete_tokens = self.delete_tokens(graph_verbalizer) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py new file mode 100644 index 000000000..64c120a9e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py @@ -0,0 +1,40 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini + +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( + NEMO_DIGIT, + NEMO_SIGMA, + GraphFst, + delete_space, +) +from pynini.lib import pynutil + +class OrdinalFst(GraphFst): + def __init__(self): + super().__init__(name="ordinal", kind="verbalize") + graph_integer = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + pynini.accep("第") + + pynini.closure(NEMO_DIGIT) + + pynini.closure(NEMO_SIGMA) + + pynutil.delete("\"") + ) + + delete_tokens = self.delete_tokens(graph_integer) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py new file mode 100644 index 000000000..0537957a3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py @@ -0,0 +1,84 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini + +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst, delete_space +from pynini.lib import pynutil + + +class TimeFst(GraphFst): + """ + Finite state transcucer for verbalizing time, e.g., + time { hours: "12" minutes: "30" } -> 12:30 + time { hours: "1" minutes: "30" } -> 01:30 + time { hours: "1" minutes: "30" affix: "a.m." } -> 01:30 a.m. 
+ """ + + def __init__(self): + super().__init__(name="time", kind="verbalize") + add_leading_zero = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT) + token_hour = ( + pynutil.delete("hours:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_DIGIT, 1, 2) + + pynutil.delete("\"") + ) + token_minute = ( + pynutil.delete("minutes:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_DIGIT, 1, 2) + + pynutil.delete("\"") + ) + + affix_am = ( + delete_space + + pynutil.delete("affix:") + + delete_space + + pynutil.delete("\"") + + pynini.accep("a.m.") + + pynutil.delete("\"") + ) + affix_am = pynutil.insert(" ") + pynini.closure(affix_am, 0, 1) + graph_am = token_hour @ add_leading_zero + delete_space + pynutil.insert(":") + token_minute + graph_am_affix = token_hour @ add_leading_zero + delete_space + pynutil.insert(":") + token_minute + affix_am + graph_am = graph_am | graph_am_affix + + # 5:00 p.m. -> 17:00 or keep 17:00 as 17:00 + affix_pm = ( + delete_space + + pynutil.delete("affix:") + + delete_space + + pynutil.delete("\"") + + pynini.accep("p.m.") + + pynutil.delete("\"") + ) + optional_affix_pm = pynutil.insert(" ") + pynini.closure(affix_pm, 0, 1) + graph_pm = token_hour @ add_leading_zero + delete_space + pynutil.insert(":") + token_minute + graph_pm_affix = ( + token_hour @ add_leading_zero + + delete_space + + pynutil.insert(":") + + token_minute + + pynutil.insert(" ") + + affix_pm + ) + graph_pm = graph_pm | graph_pm_affix + + final_graph = graph_am | graph_pm + delete_tokens = self.delete_tokens(final_graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py new file mode 100644 index 000000000..e0b04eb38 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021, NVIDIA 
CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# from nemo_text_processing.inverse_text_normalization.zh.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.date import DateFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.decimal import DecimalFst + +# from nemo_text_processing.inverse_text_normalization.zh.verbalizers.electronic import ElectronicFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.fraction import FractionFst + +# from nemo_text_processing.inverse_text_normalization.zh.verbalizers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.money import MoneyFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.ordinal import OrdinalFst + +# from nemo_text_processing.inverse_text_normalization.zh.verbalizers.telephone import TelephoneFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.time import TimeFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. 
+ More details to deployment at NeMo/tools/text_processing_deployment. + """ + + def __init__(self): + super().__init__(name="verbalize", kind="verbalize") + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + ordinal_graph = OrdinalFst().fst + decimal = DecimalFst() + decimal_graph = decimal.fst + fraction = FractionFst() + fraction_graph = fraction.fst + # measure_graph = MeasureFst(decimal=decimal, cardinal=cardinal).fst + money = MoneyFst() + money_graph = money.fst + time_graph = TimeFst().fst + date_graph = DateFst().fst + # whitelist_graph = WhiteListFst().fst + # telephone_graph = TelephoneFst().fst + # electronic_graph = ElectronicFst().fst + graph = ( + time_graph + | date_graph + | money_graph + | fraction_graph + # | measure_graph + | ordinal_graph + | decimal_graph + | cardinal_graph + # | whitelist_graph + # | telephone_graph + # | electronic_graph + ) + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py new file mode 100644 index 000000000..472da2e5a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py @@ -0,0 +1,44 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst, delete_extra_space, delete_space +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.word import WordFst +from pynini.lib import pynutil + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence, e.g. + tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now + """ + + def __init__(self): + super().__init__(name="verbalize_final", kind="verbalize") + verbalize = VerbalizeFst().fst + word = WordFst().fst + types = verbalize | word + graph = ( + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py new file mode 100644 index 000000000..1e8654a2a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py @@ -0,0 +1,38 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( + NEMO_CHAR, + NEMO_SIGMA, + GraphFst, + delete_space, +) +from pynini.lib import pynutil + + +class WordFst(GraphFst): + """ + Finite state transducer for verbalizing plain tokens + e.g. tokens { name: "sleep" } -> sleep + """ + + def __init__(self): + super().__init__(name="word", kind="verbalize") + chars = pynini.closure(NEMO_CHAR - " ", 1) + char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + + self.fst = graph.optimize() diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..636d0e8ad --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,130 @@ +一百~100 +一百零一~101 +一百一十一~111 +两百~200 +九百~900 +九百五十~950 +九百五十一~951 +一千~1千 +一千零一~1,001 +一千一百~1,100 +一千一百零一~1,101 +一千零五十~1,050 +一千一百一十~1,110 +一千一百十~1,110 +一千一百一十一~1,111 +两千~2千 +九千九百九十九~9,999 +一万一千~11,000 +一万一千一百~11,100 +一万一千一百一十~11,110 +一万一千一百一十一~11,111 +一万零一百~10,100 +一万零一百五十~10,150 +一万零一百五十一~10,151 +一万零一~10,001 +一万零五十~10,050 +一万零五十一~10,051 +一万~1万 +两万~2万 +三万~3万 +四万~4万 +五万~5万 +六万~6万 +七万~7万 +八万~8万 +九万~9万 +十万~10万 +十萬~10萬 +九十万~90万 +九十一万~91万 +九十万五千八百二十五~905,825 +九十一万五千八百二十五~915,825 +十一万~11万 +十万一千一百一十一~101,111 +十万一千一百~101,100 +十万一千~101,000 +十万零一百~100,100 +十万零十~100,010 +十万零一~100,001 +一百万~100万 +一百一十万~110万 +一百一十一万~111万 +两百万~200万 +两百一十万~210万 +两百零一万~201万 +一百一十九万~119万 +一百一十九万九千~1,199,000 +一百一十九万九千九百~1,199,900 +一百一十九万九千九百九十~1,199,990 +一百一十九万九千九百九十九~1,199,999 +一百一十九万零九~1,190,009 +一百一十九万零九百九十一~1,190,991 +一千万~1,000万 +一千一百万~1,100万 +一千一百一十万~1,110万 +一千一百一十一万~1,111万 +一千一百一十一万九千~11,119,000 +一千一百一十一万九千一百~11,119,100 +一千一百一十一万九千一百二十~11,119,120 +一千一百一十一万九千一百二十一~11,119,121 
+一千一百一十一万零一~11,110,001 +一千一百一十一万零一十~11,110,010 +一千一百一十一万零一百~11,110,100 +一千零一十万零一百~10,100,100 +一千零一十一万零一百~10,110,100 +一千零一万零一百~10,010,100 +一億~1億 +一億一千萬~110,000,000 +一億一千一百萬~111,000,000 +一億一千一百一十萬~111,100,000 +一億一千一百一十一萬~111,110,000 +一億零一百萬~101,000,000 +一億零一百一十萬~101,100,000 +一億零一百一十一萬~101,110,000 +一億零一十萬~100,100,000 +一億零一十一萬~100,110,000 +一億零一萬~100,010,000 +一億零一萬一千~100,011,000 +一億零一萬一千一百~100,011,100 +一億零一萬一千一百一~100,011,101 +一億零一萬一千一百一十一~100,011,111 +一億零一萬一千一百零五~100,011,105 +一億零一萬一千零五~100,011,005 +十億~10億 +十一億~11億 +十一億九千萬~1,190,000,000 +十一億九千一百萬~1,191,000,000 +十一億九千一百一十萬~1,191,100,000 +十一億九千一百一十一萬~1,191,110,000 +十一億零一百一十萬~1,101,100,000 +十一億零一十萬~1,100,100,000 +十一億零一萬~1,100,010,000 +十一億零十萬~1,100,100,000 +十一億零九千~1,100,009,000 +十一億零九百~1,100,000,900 +十一億零九十~1,100,000,090 +十一億零九~1,100,000,009 +一百億~100億 +一百一十億~110億 +一百一十一億~111億 +一百一十一億九千萬~11,190,000,000 +一百一十一億九千九百萬~11,199,000,000 +一百一十一億九千九百一十萬~11,199,100,000 +一百一十一億九千九百一十一萬~11,199,110,000 +一百一十一億九千九百一十一萬九千~11,199,119,000 +一百一十一億九千九百一十一萬九千九百一十一~11,199,119,911 +一百零一億~101億 +一百零一億零九百萬~10,109,000,000 +一百零一億零九十萬~10,100,900,000 +一百零一億零九萬~10,100,090,000 +一百零一億零九萬零一百~10,100,090,100 +一千億~1,000億 +一千一百億~1,100億 +一千零五十億~1,050億 +一千零五億~1,005億 +一千億九千萬~100,090,000,000 +一千億零九百萬~100,009,000,000 +一千億零九十萬~100,000,900,000 +一千億零九萬~100,000,090,000 +一千億零九十萬零五百~100,000,900,500 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..5404b8ec3 --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_date.txt @@ -0,0 +1,31 @@ +一七九八年五月三十日~1798年5月30日 +五月三十日~5月30日 +一七九八年五月~1798年5月 +八月~8月 +一七九八年~1798年 +十九日~19日 +一九九四年一月二日~1994年1月2日 +一九九五年二月三日~1995年2月3日 +二零零零年三月五日~2000年3月5日 +二零零一年四月六日~2001年4月6日 +公元一七九八年五月三十日~公元1798年5月30日 +公元一八三五年~公元1835年 +公元一八三四年八月~公元1834年8月 +公元一九九四年一月二日~公元1994年1月2日 +公元一九九五年二月三日~公元1995年2月3日 +公元二零零零年三月五日~公元2000年3月5日 
+公元二零零一年四月六日~公元2001年4月6日 +公元前一七九八年~公元前1798年 +公元前二八零九年~公元前2809年 +公元前一九九四年一月二日~公元前1994年1月2日 +公元前一九九五年二月三日~公元前1995年2月3日 +公元前二零零零年三月五日~公元前2000年3月5日 +公元前二零零一年四月六日~公元前2001年4月6日 +纪元前一九三四年一月二日~公元前1934年1月2日 +纪元前一九九八年三月三日~公元前1998年3月3日 +纪元前二零零零年三月五日~公元前2000年3月5日 +纪元前二零零一年四月六日~公元前2001年4月6日 +纪元一二三四年一月二日~公元1234年1月2日 +纪元二零五六年二月三日~公元2056年2月3日 +纪元二零零零年三月五日~公元2000年3月5日 +纪元二零零一年四月六日~公元2001年4月6日 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..5672ea60a --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt @@ -0,0 +1,42 @@ +一点零~1.0 +十五点零~15.0 +一百点零~100.0 +一百零一点五~101.5 +一点零五六~1.056 +一点零零五六~1.005,6 +一点零零零五六~1.000,56 +两百点一~200.1 +三千点五~3,000.5 +四万点六~40,000.6 +一點零零五~1.005 +九十九點零零零五~99.000,5 +一百點五七三五~100.573,5 +一千五百点零一~1,500.01 +负五万点二四五~-50,000.245 +负十五万点三七九~-150,000.379 +负一点一~-1.1 +负十点五~-10.5 +負十點五~-10.5 +負九十九點九五~-99.95 +負一百五十點一二~-150.12 +負一千五百零九點五一~-1,509.51 +負五萬點三~-50,000.3 +負五點零一~-5.01 +負十點零零一~-10.001 +負十點零零零三~-10.000,3 +負一百點零零零零四~-100.000,04 +一点一二三四五六七八九~1.123,456,789 +负五点一零二~-5.102 +负三点一二零三~-3.120,3 +负十点一二三零五~-10.123,05 +伍拾壹点肆~51.4 +壹佰点叁肆~100.34 +贰拾点伍陆~20.56 +柒拾捌点玖~78.9 +负叁拾壹点肆~-31.4 +负壹佰点叁肆~-100.34 +负贰拾点伍陆~-20.56 +负柒拾点玖~-70.9 +負贰拾点叁肆~-20.34 +負玖点玖~-9.9 +負壹佰贰拾点叁肆~-120.34 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_fraction.txt new file mode 100644 index 000000000..473f1dfb9 --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_fraction.txt @@ -0,0 +1,20 @@ +五分之一~1/5 +二分之一~1/2 +三分之一~1/3 +十分之一~1/10 +一百分之一~1/100 +一千分之一~1/1000 +五分之二~2/5 +三分之二~2/3 +十分之五~5/10 +一千分之五~5/1000 +三又五分之一~3又1/5 +一又二分之一~1又1/2 +一又三分之一~1又1/3 +三又十分之一~3又1/10 +五十又一百分之一~50又1/100 
+三又一千分之五~3又5/1000 +六又十分之五~6又5/10 +八又七分之五~8又5/7 +九又四分之三~9又3/4 +五分之四~4/5 diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_money.txt new file mode 100644 index 000000000..2d1311597 --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_money.txt @@ -0,0 +1,49 @@ +一千美元~US$1千 +五千美元~US$5千 +一万美元~US$1万 +一点五万美元~US$1.5万 +五十万美元~US$50万 +一百万美元~US$100万 +一千万美元~US$1000万 +一千元~¥1千 +五千元~¥5千 +一万元~¥1万 +一千五万元~¥1005万 +五十万元~¥50万 +一百万元~¥100万 +一千万元~¥1000万 +一千欧元~€1千 +五千欧元~€5千 +一万欧元~€1万 +一点五万欧元~€1.5万 +五十万欧元~€50万 +一百万欧元~€100万 +一千万欧元~€1000万 +一千英镑~£1千 +五千英镑~£5千 +一万英镑~£1万 +一点五万英镑~£1.5万 +五十万英镑~£50万 +一百万英镑~£100万 +一千万英镑~£1000万 +一千韩元~₩1千 +五千韩元~₩5千 +一万韩元~₩1万 +一点五万韩元~₩1.5万 +五十万韩元~₩50万 +一百万韩元~₩100万 +一千万韩元~₩1000万 +一千印度卢布~₹1千 +五千印度卢布~₹5千 +一万印度卢布~₹1万 +一点五万印度卢布~₹1.5万 +五十万印度卢布~₹50万 +一百万印度卢布~₹100万 +一千万印度卢布~₹1000万 +一千日元~JPY¥1千 +五千日元~JPY¥5千 +一万日元~JPY¥1万 +一点五万日元~JPY¥1.5万 +五十万日元~JPY¥50万 +一百万日元~JPY¥100万 +一千万日元~JPY¥1000万 diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_ordinal.txt new file mode 100644 index 000000000..d6e15f1dd --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,57 @@ +第一百~第100 +第五百~第500 +第兩萬一千一百一十一~第21111 +第一百~第100 +第二百~第200 +第兩千~第2千 +第两万~第2万 +第十万~第10万 +第一百万~第100万 +第一千万~第1000万 +第一亿~第1亿 +第一百零一~第101 +第十亿~第10亿 +第五十万~第50万 +第一百一十一~第111 +第十万一千一百一十一~第101111 +第十万一千一百~第101100 +第十万一千~第101000 +第十万零一百~第100100 +第十万零十~第100010 +第十万零一~第100001 +第一百万~第100万 +第一百一十万~第110万 +第一百一十一万~第111万 +第两百万~第200万 +第两百一十万~第210万 +第两百零一万~第201万 +第一百一十九万~第119万 +第一百一十九万九千~第1199000 +第一百一十九万九千九百~第1199900 +第一百一十九万九千九百九十~第1199990 +第一百一十九万九千九百九十九~第1199999 +第一百一十九万零九~第1190009 +第一百一十九万零九十~第1190090 +第一百一十九万零九十一~第1190091 +第一百一十九万零九百九十一~第1190991 +第一千万~第1000万 +第一千一百万~第1100万 +第一千一百一十万~第1110万 
+第一千一百一十一万~第1111万 +第一千一百一十一万九千~第11119000 +第一千一百一十一万九千一百~第11119100 +第一千一百一十一万九千一百二十~第11119120 +第一千一百一十一万九千一百二十一~第11119121 +第一千一百一十一万零一~第11110001 +第一千一百一十一万零一十~第11110010 +第一千一百一十一万零一百~第11110100 +第一千零一十万零一百~第10100100 +第一千零一十一万零一百~第10110100 +第一千零一万零一百~第10010100 +第一億~第1億 +第一億一千萬~第110000000 +第一億一千一百萬~第111000000 +第一億一千一百一十萬~第111100000 +第一億一千一百一十一萬~第111110000 +第一億零一百萬~第101000000 +第一億零一百一十萬~第101100000 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..928f83063 --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt @@ -0,0 +1,22 @@ +五点五分~05:05 +五点半~05:30 +五点一刻~05:15 +两点二刻~02:30 +三点三刻~03:45 +六点~06:00 +上午五点五分~05:05 a.m. +上午五点半~05:30 a.m. +上午五点一刻~05:15 a.m. +上午两点二刻~02:30 a.m. +上午三点三刻~03:45 a.m. +下午五点五分~05:05 p.m. +下午五点半~05:30 p.m. +下午两点一刻~02:15 p.m. +下午三点二刻~03:30 p.m. +下午四点~04:00 p.m. +正午~12:00 +半夜~00:00 +三点差五分~02:55 +两点差三分~01:57 +三点差四分~02:56 +四点差五分~03:55 \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..66f3445a0 --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt @@ -0,0 +1,49 @@ +~ +yahoo!~yahoo! +vingt!~20 ! 
+x ~x +—~— +aaa~aaa +aabach~aabach +aabenraa~aabenraa +aabye~aabye +aaccessed~aaccessed +aach~aach +aachen's~aachen's +aadri~aadri +aafia~aafia +aagaard~aagaard +aagadu~aagadu +aagard~aagard +aagathadi~aagathadi +aaghart's~aaghart's +aagnes~aagnes +aagomoni~aagomoni +aagon~aagon +aagoo~aagoo +aagot~aagot +aahar~aahar +aahh~aahh +aahperd~aahperd +aaibinterstate~aaibinterstate +aajab~aajab +aakasa~aakasa +aakervik~aakervik +aakirkeby~aakirkeby +aalam~aalam +aalbaek~aalbaek +aaldiu~aaldiu +aalem~aalem +a'ali~a'ali +aalilaassamthey~aalilaassamthey +aalin~aalin +aaliyan~aaliyan +aaliyan's~aaliyan's +aamadu~aamadu +aamara~aamara +aambala~aambala +aamera~aamera +aamer's~aamer's +aamina~aamina +aaminah~aaminah +aamjiwnaang~aamjiwnaang diff --git a/tests/nemo_text_processing/zh/test_cardinal.py b/tests/nemo_text_processing/zh/test_cardinal.py new file mode 100644 index 000000000..17d9e2dc0 --- /dev/null +++ b/tests/nemo_text_processing/zh/test_cardinal.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestCardinal: + + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_date.py b/tests/nemo_text_processing/zh/test_date.py index d8079e3a6..d55e1d2a6 100644 --- a/tests/nemo_text_processing/zh/test_date.py +++ b/tests/nemo_text_processing/zh/test_date.py @@ -14,6 +14,8 @@ import pytest from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file @@ -28,3 +30,12 @@ class TestDate: def test_norm_date(self, test_input, expected): preds = self.normalizer_zh.normalize(test_input) assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_decimal.py b/tests/nemo_text_processing/zh/test_decimal.py new file mode 100644 index 000000000..f8f73785c --- /dev/null +++ b/tests/nemo_text_processing/zh/test_decimal.py @@ -0,0 +1,30 @@ +# Copyright (c) 
2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDecimal: + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_fraction.py b/tests/nemo_text_processing/zh/test_fraction.py index 03b508b21..d396dd1c8 100644 --- a/tests/nemo_text_processing/zh/test_fraction.py +++ b/tests/nemo_text_processing/zh/test_fraction.py @@ -13,6 +13,7 @@ # limitations under the License. 
import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized @@ -28,3 +29,11 @@ class TestFraction: def test_norm_fraction(self, test_input, expected): preds = self.normalizer_zh.normalize(test_input) assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_fraction.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/test_money.py b/tests/nemo_text_processing/zh/test_money.py index d06a2b812..5ab059087 100644 --- a/tests/nemo_text_processing/zh/test_money.py +++ b/tests/nemo_text_processing/zh/test_money.py @@ -13,6 +13,7 @@ # limitations under the License. 
import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized @@ -28,3 +29,13 @@ class TestMoney: def test_norm_money(self, test_input, expected): preds = self.normalizer_zh.normalize(test_input) assert expected == preds + + + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_money.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/test_ordinal.py b/tests/nemo_text_processing/zh/test_ordinal.py new file mode 100644 index 000000000..9775d5522 --- /dev/null +++ b/tests/nemo_text_processing/zh/test_ordinal.py @@ -0,0 +1,30 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestOrdinal: + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_ordinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh new file mode 100644 index 000000000..4ca12af7f --- /dev/null +++ b/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh @@ -0,0 +1,84 @@ +#! /bin/sh + +PROJECT_DIR=/workspace/tests + +runtest () { + input=$1 + cd /workspace/sparrowhawk/documentation/grammars + + # read test file + while read testcase; do + IFS='~' read spoken written <<< $testcase + denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + + # trim white space + written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$spoken" "$written" "$denorm_pred" + done < "$input" +} + +testITNCardinal() { + input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testITNDate() { + input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_date.txt + runtest $input +} + +testITNDecimal() { + input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_decimal.txt + runtest $input +} + +testITNOrdinal() { + 
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_ordinal.txt + runtest $input +} + +testITNFraction() { + input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_fraction.txt + runtest $input +} + +testITNTime() { + input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_time.txt + runtest $input +} + +testITNMeasure() { + input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_measure.txt + runtest $input +} + +testITNMoney() { + input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_money.txt + runtest $input +} + +testITNWhitelist() { + input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_whitelist.txt + runtest $input +} + +testITNTelephone() { + input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_telephone.txt + runtest $input +} + +testITNElectronic() { + input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_electronic.txt + runtest $input +} + +testITNWord() { + input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_word.txt + runtest $input +} + +# Load shUnit2 +. $PROJECT_DIR/../shunit2/shunit2 diff --git a/tests/nemo_text_processing/zh/test_time.py b/tests/nemo_text_processing/zh/test_time.py index d36737afb..4b30efb99 100644 --- a/tests/nemo_text_processing/zh/test_time.py +++ b/tests/nemo_text_processing/zh/test_time.py @@ -13,6 +13,7 @@ # limitations under the License. 
import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized @@ -28,3 +29,12 @@ class TestTime: def test_norm_time(self, test_input, expected): preds = self.normalizer_zh.normalize(test_input) assert expected == preds + + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected \ No newline at end of file diff --git a/tests/nemo_text_processing/zh/test_whitelist.py b/tests/nemo_text_processing/zh/test_whitelist.py new file mode 100644 index 000000000..075584b3b --- /dev/null +++ b/tests/nemo_text_processing/zh/test_whitelist.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWhitelist: + inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_word.py b/tests/nemo_text_processing/zh/test_word.py new file mode 100644 index 000000000..46057117f --- /dev/null +++ b/tests/nemo_text_processing/zh/test_word.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWord: + inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected From 1b34ceb503da95ae1ff650dcf69da1d65825aae6 Mon Sep 17 00:00:00 2001 From: Anand Joseph Date: Thu, 9 Feb 2023 21:10:32 +0530 Subject: [PATCH 02/89] Fix copyrights and code cleanup Signed-off-by: Anand Joseph --- .../inverse_normalize.py | 5 +++ .../inverse_text_normalization/zh/__init__.py | 2 +- .../zh/clean_eval_data.py | 2 +- .../zh/data/__init__.py | 2 +- .../zh/data/date/__init__.py | 2 +- .../zh/data/money/__init__.py | 2 +- .../zh/data/numbers/__init__.py | 2 +- .../zh/data/time/__init__.py | 2 +- .../zh/graph_utils.py | 45 +------------------ .../zh/taggers/__init__.py | 2 +- .../zh/taggers/cardinal.py | 3 +- .../zh/taggers/date.py | 3 +- .../zh/taggers/decimal.py | 3 +- .../zh/taggers/fraction.py | 3 +- .../zh/taggers/money.py | 6 +-- .../zh/taggers/ordinal.py | 3 +- .../zh/taggers/punctuation.py | 3 +- .../zh/taggers/time.py | 4 +- .../zh/taggers/tokenize_and_classify.py | 18 ++------ .../zh/taggers/word.py | 3 +- .../inverse_text_normalization/zh/utils.py | 2 +- .../zh/verbalizers/__init__.py | 2 +- .../zh/verbalizers/cardinal.py | 2 +- .../zh/verbalizers/date.py | 3 +- .../zh/verbalizers/decimal.py | 2 +- .../zh/verbalizers/fraction.py | 3 +- .../zh/verbalizers/money.py | 2 +- .../zh/verbalizers/ordinal.py | 3 +- .../zh/verbalizers/time.py | 3 +- .../zh/verbalizers/verbalize.py | 12 +---- 
.../zh/verbalizers/verbalize_final.py | 3 +- .../zh/verbalizers/word.py | 3 +- .../nemo_text_processing/zh/test_cardinal.py | 2 +- tests/nemo_text_processing/zh/test_date.py | 2 +- tests/nemo_text_processing/zh/test_decimal.py | 2 +- .../nemo_text_processing/zh/test_whitelist.py | 31 ------------- tests/nemo_text_processing/zh/test_word.py | 6 +-- 37 files changed, 46 insertions(+), 152 deletions(-) delete mode 100644 tests/nemo_text_processing/zh/test_whitelist.py diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index 4e5d44de7..d11c92605 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -90,6 +90,11 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ar.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) + elif lang == 'zh': # Chinese + from nemo_text_processing.inverse_text_normalization.zh.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize_final import ( + VerbalizeFinalFst, + ) self.tagger = ClassifyFst(cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache) self.verbalizer = VerbalizeFinalFst() diff --git a/nemo_text_processing/inverse_text_normalization/zh/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/__init__.py index c07b8e4c2..4a73fa2f5 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/zh/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/zh/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/zh/clean_eval_data.py index d9bc2fccb..2d3916560 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/clean_eval_data.py +++ b/nemo_text_processing/inverse_text_normalization/zh/clean_eval_data.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/__init__.py index 4fc50543f..6ebc808fa 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/data/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/zh/data/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/date/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/date/__init__.py index 4fc50543f..6ebc808fa 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/data/date/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/zh/data/date/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/money/__init__.py index 4fc50543f..6ebc808fa 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/data/money/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/zh/data/money/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/__init__.py index bc443be41..6ebc808fa 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/data/time/__init__.py index 4fc50543f..6ebc808fa 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/data/time/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index dfef9dc9b..e1ded9235 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -53,25 +53,6 @@ | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"")) ) -# suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) -# _v = pynini.union("a", "e", "i", "o", "u") -# _c = pynini.union( -# "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z" -# ) -# _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") -# _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") -# _s = NEMO_SIGMA + pynutil.insert("s") - -# graph_plural = plurals._priority_union( -# plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA -# ).optimize() - -# SINGULAR_TO_PLURAL = graph_plural -# PLURAL_TO_SINGULAR = pynini.invert(graph_plural) -# TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]) -# TO_UPPER = pynini.invert(TO_LOWER) -# MIN_NEG_WEIGHT = -0.0001 -# MIN_POS_WEIGHT = 0.0001 def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): @@ -89,30 +70,6 @@ def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): print(f'Created {file_name}') -# def get_plurals(fst): -# """ -# Given singular returns plurals# -# -# Args: -# fst: Fst# -# -# Returns plurals to given singular forms -# """ -# return SINGULAR_TO_PLURAL @ fst - - -# def get_singulars(fst): -# """ 
-# Given plural returns singulars# -# -# Args: -# fst: Fst# -# -# Returns singulars to given plural forms -# """ -# return PLURAL_TO_SINGULAR @ fst - - def convert_space(fst) -> 'pynini.FstLike': """ Converts space to nonbreaking space. diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/__init__.py index 4fc50543f..6ebc808fa 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index d21faa491..5912bb3bb 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -1,5 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py index c75b1e704..190b49b4e 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py @@ -1,5 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
-# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py index 6ce84b4b6..bd422d70e 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py @@ -1,5 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py index fc3dbeda2..f82fb510c 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py @@ -1,5 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py index c2b0f44f2..334b33ebe 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py @@ -1,5 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -70,12 +69,10 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph_tencent_fractional_comp = pynutil.insert("tencent_part: \"") + fraction_integer + pynutil.insert("\"") # yuan symbol part - # graph_currency_major = pynutil.insert("currency: \"") + major_currency + pynutil.insert("\"") graph_currency_minor_cent = pynutil.insert("currency: \"") + minor_currency_cent + pynutil.insert("\"") graph_currency_minor_tencent = pynutil.insert("currency: \"") + minor_currency_tencent + pynutil.insert("\"") # yuan combine number and symbol part - # graph_only_major_yuan = graph_integer_component + pynutil.insert(" ") + graph_major_currency# + pynutil.insert(" ") + graph_currency_rmb_token graph_only_cent = graph_cent_fractional_comp + pynutil.insert(" ") + graph_currency_minor_cent graph_only_tencent = graph_tencent_fractional_comp + pynutil.insert(" ") + graph_currency_minor_tencent @@ -119,7 +116,6 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) # final graph for yuan - # graph_yuan_only = graph_only_major_yuan | graph_only_cent | graph_only_tencent graph_yuan_only = graph_only_cent | graph_only_tencent graph_yuan_comb = graph_major_cent | graph_major_tencent | graph_tencent_cent | graph_major_minor diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py 
b/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py index 576db4206..eefb3e0ab 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py @@ -1,5 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py index f6fc63e48..5c89d51d6 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py @@ -1,5 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py index 9d294b8cd..0f22390c7 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py @@ -1,5 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -70,7 +69,6 @@ def __init__(self): graph_hours = hours + graph_delete_hours # graph for minutes - # graph_minutes = pynini.closure(pynutil.delete("分"), 0, 1) graph_minutes = pynutil.delete('分') graph_minutes = minutes + graph_minutes diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py index c41c288ff..e3b9b6d9e 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py @@ -1,5 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -54,7 +53,7 @@ class ClassifyFst(GraphFst): overwrite_cache: set to True to overwrite .far files """ - def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): + def __init__(self, cache_dir: str = None, whitelist: str=None, overwrite_cache: bool = False): super().__init__(name="tokenize_and_classify", kind="classify") far_file = None @@ -75,32 +74,23 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): decimal = DecimalFst(cardinal) decimal_graph = decimal.fst - # measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst - # date_graph = DateFst(ordinal=ordinal).fst - date = DateFst() - date_graph = date.fst + + date_graph = DateFst().fst word_graph = WordFst().fst time_graph = TimeFst().fst money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst fraction = FractionFst(cardinal) fraction_graph = fraction.fst - # whitelist_graph = WhiteListFst().fst punct_graph = PunctuationFst().fst - # electronic_graph = ElectronicFst().fst - # telephone_graph = 
TelephoneFst(cardinal).fst classify = ( - # pynutil.add_weight(whitelist_graph, 1.01) pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(date_graph, 1.09) | pynutil.add_weight(decimal_graph, 1.2) - # | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(ordinal_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) | pynutil.add_weight(fraction_graph, 1.1) - # | pynutil.add_weight(telephone_graph, 1.1) - # | pynutil.add_weight(electronic_graph, 1.1) | pynutil.add_weight(word_graph, 100) ) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/word.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/word.py index 84d1f4e16..3e129fb98 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/word.py @@ -1,5 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/utils.py b/nemo_text_processing/inverse_text_normalization/zh/utils.py index 6a8bd48cd..ca6210150 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/__init__.py index 4fc50543f..6ebc808fa 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py index ebf2f018b..55d906746 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py index 5bb150543..dcf0b2a36 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py @@ -1,5 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py index 4554769db..a71b93a50 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py index 5f8cd9d99..0fbe55b21 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py @@ -1,5 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py index 4f439a53e..2ddeaffd8 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py index 64c120a9e..8b15466c8 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py @@ -1,5 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py index 0537957a3..de0750f9c 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py @@ -1,5 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py index e0b04eb38..7aab9b49f 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py @@ -1,5 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -46,25 +45,18 @@ def __init__(self): decimal_graph = decimal.fst fraction = FractionFst() fraction_graph = fraction.fst - # measure_graph = MeasureFst(decimal=decimal, cardinal=cardinal).fst money = MoneyFst() money_graph = money.fst time_graph = TimeFst().fst date_graph = DateFst().fst - # whitelist_graph = WhiteListFst().fst - # telephone_graph = TelephoneFst().fst - # electronic_graph = ElectronicFst().fst + graph = ( time_graph | date_graph | money_graph | fraction_graph - # | measure_graph | ordinal_graph | decimal_graph | cardinal_graph - # | whitelist_graph - # | telephone_graph - # | electronic_graph ) self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py index 472da2e5a..e21b1d332 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py @@ -1,5 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py index 1e8654a2a..1456c4047 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py @@ -1,5 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. 
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/nemo_text_processing/zh/test_cardinal.py b/tests/nemo_text_processing/zh/test_cardinal.py index 17d9e2dc0..ebd00b16a 100644 --- a/tests/nemo_text_processing/zh/test_cardinal.py +++ b/tests/nemo_text_processing/zh/test_cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/nemo_text_processing/zh/test_date.py b/tests/nemo_text_processing/zh/test_date.py index d55e1d2a6..d9bc52032 100644 --- a/tests/nemo_text_processing/zh/test_date.py +++ b/tests/nemo_text_processing/zh/test_date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/nemo_text_processing/zh/test_decimal.py b/tests/nemo_text_processing/zh/test_decimal.py index f8f73785c..92af62a30 100644 --- a/tests/nemo_text_processing/zh/test_decimal.py +++ b/tests/nemo_text_processing/zh/test_decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/tests/nemo_text_processing/zh/test_whitelist.py b/tests/nemo_text_processing/zh/test_whitelist.py deleted file mode 100644 index 075584b3b..000000000 --- a/tests/nemo_text_processing/zh/test_whitelist.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from parameterized import parameterized - -from ..utils import CACHE_DIR, parse_test_case_file - - -class TestWhitelist: - inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False) - - @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_whitelist.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_word.py b/tests/nemo_text_processing/zh/test_word.py index 46057117f..4ba6ae9e5 100644 --- a/tests/nemo_text_processing/zh/test_word.py +++ b/tests/nemo_text_processing/zh/test_word.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,9 +21,9 @@ class TestWord: - inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) - @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_word.txt')) + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_word.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm(self, test_input, expected): From 9b61ce82f00d488cc3a6974993aa150fe776e327 Mon Sep 17 00:00:00 2001 From: Anand Joseph Date: Thu, 9 Feb 2023 21:15:07 +0530 Subject: [PATCH 03/89] Remove invalid tests Signed-off-by: Anand Joseph --- .../test_cases_word.txt | 49 ------------------- tests/nemo_text_processing/zh/test_word.py | 31 ------------ 2 files changed, 80 deletions(-) delete mode 100644 tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt delete mode 100644 tests/nemo_text_processing/zh/test_word.py diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt deleted file mode 100644 index 66f3445a0..000000000 --- a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt +++ /dev/null @@ -1,49 +0,0 @@ -~ -yahoo!~yahoo! -vingt!~20 ! 
-x ~x -—~— -aaa~aaa -aabach~aabach -aabenraa~aabenraa -aabye~aabye -aaccessed~aaccessed -aach~aach -aachen's~aachen's -aadri~aadri -aafia~aafia -aagaard~aagaard -aagadu~aagadu -aagard~aagard -aagathadi~aagathadi -aaghart's~aaghart's -aagnes~aagnes -aagomoni~aagomoni -aagon~aagon -aagoo~aagoo -aagot~aagot -aahar~aahar -aahh~aahh -aahperd~aahperd -aaibinterstate~aaibinterstate -aajab~aajab -aakasa~aakasa -aakervik~aakervik -aakirkeby~aakirkeby -aalam~aalam -aalbaek~aalbaek -aaldiu~aaldiu -aalem~aalem -a'ali~a'ali -aalilaassamthey~aalilaassamthey -aalin~aalin -aaliyan~aaliyan -aaliyan's~aaliyan's -aamadu~aamadu -aamara~aamara -aambala~aambala -aamera~aamera -aamer's~aamer's -aamina~aamina -aaminah~aaminah -aamjiwnaang~aamjiwnaang diff --git a/tests/nemo_text_processing/zh/test_word.py b/tests/nemo_text_processing/zh/test_word.py deleted file mode 100644 index 4ba6ae9e5..000000000 --- a/tests/nemo_text_processing/zh/test_word.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import pytest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from parameterized import parameterized - -from ..utils import CACHE_DIR, parse_test_case_file - - -class TestWord: - inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) - - @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_word.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected From cbb43793c7600d49860c907b64772ffa3684f793 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Feb 2023 15:51:23 +0000 Subject: [PATCH 04/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/zh/graph_utils.py | 1 - .../inverse_text_normalization/zh/taggers/cardinal.py | 1 - .../inverse_text_normalization/zh/taggers/fraction.py | 1 + .../inverse_text_normalization/zh/taggers/money.py | 2 +- .../inverse_text_normalization/zh/taggers/ordinal.py | 1 - .../inverse_text_normalization/zh/taggers/time.py | 2 +- .../zh/taggers/tokenize_and_classify.py | 5 ++--- .../zh/verbalizers/cardinal.py | 2 +- .../inverse_text_normalization/zh/verbalizers/date.py | 1 - .../zh/verbalizers/decimal.py | 2 +- .../zh/verbalizers/fraction.py | 1 - .../inverse_text_normalization/zh/verbalizers/money.py | 1 - .../zh/verbalizers/ordinal.py | 2 +- .../inverse_text_normalization/zh/verbalizers/time.py | 1 - .../zh/verbalizers/verbalize.py | 10 +--------- tests/nemo_text_processing/zh/test_date.py | 3 +-- tests/nemo_text_processing/zh/test_fraction.py | 3 ++- tests/nemo_text_processing/zh/test_money.py | 3 +-- tests/nemo_text_processing/zh/test_time.py | 2 +- 19 files changed, 14 insertions(+), 30 deletions(-) diff 
--git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index e1ded9235..c6e703faf 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -54,7 +54,6 @@ ) - def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): """ Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index 5912bb3bb..43d701f3c 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -13,7 +13,6 @@ # limitations under the License. import pynini - from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path from pynini.lib import pynutil diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py index f82fb510c..db3312739 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py @@ -17,6 +17,7 @@ from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst from pynini.lib import pynutil + class FractionFst(GraphFst): """ Finite state transducer for classifying fraction diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py index 334b33ebe..900865c4c 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py +++ 
b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py @@ -13,11 +13,11 @@ # limitations under the License. import pynini - from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path from pynini.lib import pynutil + class MoneyFst(GraphFst): def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="money", kind="classify") diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py index eefb3e0ab..6cc67ca43 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py @@ -13,7 +13,6 @@ # limitations under the License. import pynini - from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.zh.taggers.cardinal import CardinalFst from pynini.lib import pynutil diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py index 0f22390c7..57386da20 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py @@ -14,11 +14,11 @@ import pynini - from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path from pynini.lib import pynutil + class TimeFst(GraphFst): """ Finite state transducer for classifying time diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py index e3b9b6d9e..6cf3ab137 100644 --- 
a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py @@ -14,8 +14,8 @@ import logging import os -import pynini +import pynini from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( GraphFst, delete_extra_space, @@ -53,7 +53,7 @@ class ClassifyFst(GraphFst): overwrite_cache: set to True to overwrite .far files """ - def __init__(self, cache_dir: str = None, whitelist: str=None, overwrite_cache: bool = False): + def __init__(self, cache_dir: str = None, whitelist: str = None, overwrite_cache: bool = False): super().__init__(name="tokenize_and_classify", kind="classify") far_file = None @@ -74,7 +74,6 @@ def __init__(self, cache_dir: str = None, whitelist: str=None, overwrite_cache: decimal = DecimalFst(cardinal) decimal_graph = decimal.fst - date_graph = DateFst().fst word_graph = WordFst().fst time_graph = TimeFst().fst diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py index 55d906746..85f696fc7 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py @@ -13,7 +13,6 @@ # limitations under the License. 
import pynini - from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, @@ -22,6 +21,7 @@ ) from pynini.lib import pynutil + class CardinalFst(GraphFst): def __init__(self): super().__init__(name="cardinal", kind="verbalize") diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py index dcf0b2a36..dc44e45e8 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py @@ -13,7 +13,6 @@ # limitations under the License. import pynini - from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py index a71b93a50..f37d495eb 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py @@ -13,7 +13,6 @@ # limitations under the License. import pynini - from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( NEMO_DIGIT, NEMO_NOT_QUOTE, @@ -22,6 +21,7 @@ ) from pynini.lib import pynutil + class DecimalFst(GraphFst): def __init__(self): super().__init__(name="decimal", kind="verbalize") diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py index 0fbe55b21..4eaab1aa1 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py @@ -13,7 +13,6 @@ # limitations under the License. 
import pynini - from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst, delete_space from pynini.lib import pynutil diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py index 2ddeaffd8..d8ca03a66 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py @@ -13,7 +13,6 @@ # limitations under the License. import pynini - from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from pynini.lib import pynutil diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py index 8b15466c8..7f65f3c69 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py @@ -13,7 +13,6 @@ # limitations under the License. import pynini - from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, @@ -22,6 +21,7 @@ ) from pynini.lib import pynutil + class OrdinalFst(GraphFst): def __init__(self): super().__init__(name="ordinal", kind="verbalize") diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py index de0750f9c..dacf082f9 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py @@ -13,7 +13,6 @@ # limitations under the License. 
import pynini - from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, GraphFst, delete_space from pynini.lib import pynutil diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py index 7aab9b49f..05bc66f70 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py @@ -50,13 +50,5 @@ def __init__(self): time_graph = TimeFst().fst date_graph = DateFst().fst - graph = ( - time_graph - | date_graph - | money_graph - | fraction_graph - | ordinal_graph - | decimal_graph - | cardinal_graph - ) + graph = time_graph | date_graph | money_graph | fraction_graph | ordinal_graph | decimal_graph | cardinal_graph self.fst = graph diff --git a/tests/nemo_text_processing/zh/test_date.py b/tests/nemo_text_processing/zh/test_date.py index d9bc52032..01d3e038b 100644 --- a/tests/nemo_text_processing/zh/test_date.py +++ b/tests/nemo_text_processing/zh/test_date.py @@ -13,9 +13,8 @@ # limitations under the License. 
import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer - +from nemo_text_processing.text_normalization.normalize import Normalizer from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file diff --git a/tests/nemo_text_processing/zh/test_fraction.py b/tests/nemo_text_processing/zh/test_fraction.py index d396dd1c8..264d64d13 100644 --- a/tests/nemo_text_processing/zh/test_fraction.py +++ b/tests/nemo_text_processing/zh/test_fraction.py @@ -31,9 +31,10 @@ def test_norm_fraction(self, test_input, expected): assert expected == preds inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_fraction.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected \ No newline at end of file + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_money.py b/tests/nemo_text_processing/zh/test_money.py index 5ab059087..3d50ce5fa 100644 --- a/tests/nemo_text_processing/zh/test_money.py +++ b/tests/nemo_text_processing/zh/test_money.py @@ -30,7 +30,6 @@ def test_norm_money(self, test_input, expected): preds = self.normalizer_zh.normalize(test_input) assert expected == preds - inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_money.txt')) @@ -38,4 +37,4 @@ def test_norm_money(self, test_input, expected): @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected \ No newline at end of file + assert 
pred == expected diff --git a/tests/nemo_text_processing/zh/test_time.py b/tests/nemo_text_processing/zh/test_time.py index 4b30efb99..80d79a78c 100644 --- a/tests/nemo_text_processing/zh/test_time.py +++ b/tests/nemo_text_processing/zh/test_time.py @@ -37,4 +37,4 @@ def test_norm_time(self, test_input, expected): @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected \ No newline at end of file + assert pred == expected From 60f5f8c20ec4fc31c69f98e5ede3969de8a5ffd3 Mon Sep 17 00:00:00 2001 From: Anand Joseph Date: Thu, 9 Feb 2023 21:44:28 +0530 Subject: [PATCH 05/89] Resolve CodeQL issues Signed-off-by: Anand Joseph --- .../inverse_text_normalization/zh/graph_utils.py | 2 -- .../inverse_text_normalization/zh/taggers/fraction.py | 1 - .../inverse_text_normalization/zh/taggers/ordinal.py | 1 - .../inverse_text_normalization/zh/taggers/time.py | 1 - .../inverse_text_normalization/zh/verbalizers/time.py | 1 - 5 files changed, 6 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index c6e703faf..fe002f247 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -19,9 +19,7 @@ from typing import Dict import pynini -from nemo_text_processing.text_normalization.en.utils import get_abs_path from pynini import Far -from pynini.examples import plurals from pynini.export import export from pynini.lib import byte, pynutil, utf8 diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py index db3312739..225ade447 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py 
@@ -13,7 +13,6 @@ # limitations under the License. -import pynini from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst from pynini.lib import pynutil diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py index 6cc67ca43..2b044d5b7 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py @@ -14,7 +14,6 @@ import pynini from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst -from nemo_text_processing.inverse_text_normalization.zh.taggers.cardinal import CardinalFst from pynini.lib import pynutil diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py index 57386da20..c6fd4e436 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py @@ -113,7 +113,6 @@ def __init__(self): graph_hours_to_component = graph_hours | graph_noon | graph_midnight # | graph_hours_count graph_hours_to_component @= hours_to # hours_to is the string_file data graph_hours_to_component = pynutil.insert("hours: \"") + graph_hours_to_component + pynutil.insert("\"") - graph_hours_to_component = graph_hours_to_component # converting minutes back graph_minutes_to_component = minutes | graph_half | graph_quarter | graph_three_quarter | graph_half_alt diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py index dacf082f9..159668f7b 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py @@ -65,7 +65,6 @@ def __init__(self): + pynini.accep("p.m.") + 
pynutil.delete("\"") ) - optional_affix_pm = pynutil.insert(" ") + pynini.closure(affix_pm, 0, 1) graph_pm = token_hour @ add_leading_zero + delete_space + pynutil.insert(":") + token_minute graph_pm_affix = ( token_hour @ add_leading_zero From b646ce7776a97ed5f155b838a9309d73022f6106 Mon Sep 17 00:00:00 2001 From: Anand Joseph Date: Fri, 10 Feb 2023 04:22:58 +0530 Subject: [PATCH 06/89] Cleanup Signed-off-by: Anand Joseph --- .../zh/clean_eval_data.py | 342 ------------------ 1 file changed, 342 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/zh/clean_eval_data.py diff --git a/nemo_text_processing/inverse_text_normalization/zh/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/zh/clean_eval_data.py deleted file mode 100644 index 2d3916560..000000000 --- a/nemo_text_processing/inverse_text_normalization/zh/clean_eval_data.py +++ /dev/null @@ -1,342 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from argparse import ArgumentParser -from typing import List - -import regex as re -from nemo_text_processing.text_normalization.data_loader_utils import ( - EOS_TYPE, - Instance, - load_files, - training_data_to_sentences, -) - - -""" -This file is for evaluation purposes. -filter_loaded_data() cleans data (list of instances) for inverse text normalization. Filters and cleaners can be specified for each semiotic class individually. 
-For example, normalized text should only include characters and whitespace characters but no punctuation. - Cardinal unnormalized instances should contain at least one integer and all other characters are removed. -""" - - -class Filter: - """ - Filter class - - Args: - class_type: semiotic class used in dataset - process_func: function to transform text - filter_func: function to filter text - - """ - - def __init__(self, class_type: str, process_func: object, filter_func: object): - self.class_type = class_type - self.process_func = process_func - self.filter_func = filter_func - - def filter(self, instance: Instance) -> bool: - """ - filter function - - Args: - filters given instance with filter function - - Returns: True if given instance fulfills criteria or does not belong to class type - """ - if instance.token_type != self.class_type: - return True - return self.filter_func(instance) - - def process(self, instance: Instance) -> Instance: - """ - process function - - Args: - processes given instance with process function - - Returns: processed instance if instance belongs to expected class type or original instance - """ - if instance.token_type != self.class_type: - return instance - return self.process_func(instance) - - -def filter_cardinal_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_cardinal_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r"[^0-9]", "", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_ordinal_1(instance: Instance) -> bool: - ok = re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized) - return ok - - -def process_ordinal_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = 
re.sub(r"[,\s]", "", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_decimal_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_decimal_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - un_normalized = re.sub(r",", "", un_normalized) - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_measure_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_measure_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r",", "", un_normalized) - un_normalized = re.sub(r"m2", "m²", un_normalized) - un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized) - normalized = re.sub(r"[^a-z\s]", "", normalized) - normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_money_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_money_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r",", "", un_normalized) - un_normalized = re.sub(r"a\$", r"$", un_normalized) - un_normalized = re.sub(r"us\$", r"$", un_normalized) - un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized) - un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, 
normalized=normalized) - - -def filter_time_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_time_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - un_normalized = re.sub(r": ", ":", un_normalized) - un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized) - un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized) - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_plain_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_plain_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_punct_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_punct_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_date_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_date_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - un_normalized = re.sub(r",", "", un_normalized) - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_letters_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_letters_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, 
normalized=normalized) - - -def filter_verbatim_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_verbatim_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_digit_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_digit_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_telephone_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_telephone_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_electronic_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_electronic_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_fraction_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_fraction_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def 
filter_address_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_address_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -filters = [] -filters.append(Filter(class_type="CARDINAL", process_func=process_cardinal_1, filter_func=filter_cardinal_1)) -filters.append(Filter(class_type="ORDINAL", process_func=process_ordinal_1, filter_func=filter_ordinal_1)) -filters.append(Filter(class_type="DECIMAL", process_func=process_decimal_1, filter_func=filter_decimal_1)) -filters.append(Filter(class_type="MEASURE", process_func=process_measure_1, filter_func=filter_measure_1)) -filters.append(Filter(class_type="MONEY", process_func=process_money_1, filter_func=filter_money_1)) -filters.append(Filter(class_type="TIME", process_func=process_time_1, filter_func=filter_time_1)) - -filters.append(Filter(class_type="DATE", process_func=process_date_1, filter_func=filter_date_1)) -filters.append(Filter(class_type="PLAIN", process_func=process_plain_1, filter_func=filter_plain_1)) -filters.append(Filter(class_type="PUNCT", process_func=process_punct_1, filter_func=filter_punct_1)) -filters.append(Filter(class_type="LETTERS", process_func=process_letters_1, filter_func=filter_letters_1)) -filters.append(Filter(class_type="VERBATIM", process_func=process_verbatim_1, filter_func=filter_verbatim_1)) -filters.append(Filter(class_type="DIGIT", process_func=process_digit_1, filter_func=filter_digit_1)) -filters.append(Filter(class_type="TELEPHONE", process_func=process_telephone_1, filter_func=filter_telephone_1)) -filters.append(Filter(class_type="ELECTRONIC", process_func=process_electronic_1, filter_func=filter_electronic_1)) -filters.append(Filter(class_type="FRACTION", process_func=process_fraction_1, filter_func=filter_fraction_1)) 
-filters.append(Filter(class_type="ADDRESS", process_func=process_address_1, filter_func=filter_address_1)) -filters.append(Filter(class_type=EOS_TYPE, process_func=lambda x: x, filter_func=lambda x: True)) - - -def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]: - """ - Filters list of instances - - Args: - data: list of instances - - Returns: filtered and transformed list of instances - """ - updates_instances = [] - for instance in data: - updated_instance = False - for fil in filters: - if fil.class_type == instance.token_type and fil.filter(instance): - instance = fil.process(instance) - updated_instance = True - if updated_instance: - if verbose: - print(instance) - updates_instances.append(instance) - return updates_instances - - -def parse_args(): - parser = ArgumentParser() - parser.add_argument("--input", help="input file path", type=str, default='./en_with_types/output-00001-of-00100') - parser.add_argument("--verbose", help="print filtered instances", action='store_true') - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - file_path = args.input - - print("Loading training data: " + file_path) - instance_list = load_files([file_path]) # List of instances - filtered_instance_list = filter_loaded_data(instance_list, args.verbose) - training_data_to_sentences(filtered_instance_list) From f2366f24e889b157a5c5aa7ec2eccd2b65c3e698 Mon Sep 17 00:00:00 2001 From: Anand Joseph Date: Fri, 10 Feb 2023 04:26:36 +0530 Subject: [PATCH 07/89] Fix missing 'zh' option for ITN and correct comment Signed-off-by: Anand Joseph --- .../inverse_text_normalization/inverse_normalize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index d11c92605..a992fd60e 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ 
b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -90,7 +90,7 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ar.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) - elif lang == 'zh': # Arabic + elif lang == 'zh': # Mandarin from nemo_text_processing.inverse_text_normalization.zh.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize_final import ( VerbalizeFinalFst, @@ -135,7 +135,7 @@ def parse_args(): input.add_argument("--input_file", dest="input_file", help="input file path", type=str) parser.add_argument('--output_file', dest="output_file", help="output file path", type=str) parser.add_argument( - "--language", help="language", choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi'], default="en", type=str + "--language", help="language", choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi', 'zh'], default="en", type=str ) parser.add_argument( "--whitelist", From 0e24e4312cf95bb565b9dd139495df186b354414 Mon Sep 17 00:00:00 2001 From: "Buyuan(Alex) Cui" <69030297+BuyuanCui@users.noreply.github.com> Date: Wed, 1 Mar 2023 10:05:56 -0800 Subject: [PATCH 08/89] Update __init__.py Change to zh instead of en for the imports. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> --- .../inverse_text_normalization/zh/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/__init__.py b/nemo_text_processing/inverse_text_normalization/zh/__init__.py index 4a73fa2f5..ab4301382 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/zh/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import ClassifyFst -from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst +from nemo_text_processing.inverse_text_normalization.zh.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize_final import VerbalizeFinalFst From 64f37c0e4c9ffadd690ee13d3fcaadf704813be5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Mar 2023 21:51:35 +0000 Subject: [PATCH 09/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/inverse_normalize.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index 4ecb2d326..33d3a14ab 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -143,7 +143,11 @@ def parse_args(): input.add_argument("--input_file", dest="input_file", help="input file path", type=str) parser.add_argument('--output_file', dest="output_file", help="output file path", type=str) parser.add_argument( - "--language", help="language", choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi', 'ar', 'zh'], default="en", type=str + "--language", + help="language", + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi', 'ar', 'zh'], + default="en", + type=str, ) parser.add_argument( "--input_case", From a9d3ec42fd66301280d3b6f520a2416fb8eb8381 Mon Sep 17 00:00:00 2001 From: 
BuyuanCui Date: Thu, 9 Mar 2023 13:59:53 -0800 Subject: [PATCH 10/89] update for decimal test data Signed-off-by: BuyuanCui --- .../zh/data_inverse_text_normalization/test_cases_decimal.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt index 5672ea60a..fec09a461 100644 --- a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt @@ -9,7 +9,7 @@ 三千点五~3,000.5 四万点六~40,000.6 一點零零五~1.005 -九十九點零零零五~99.000,5 +九十九點零零零五~99.0005 一百點五七三五~100.573,5 一千五百点零一~1,500.01 负五万点二四五~-50,000.245 From 04f1aeeff70f696cfab064a327c17c43a165f4ad Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 14 Mar 2023 09:16:48 -0700 Subject: [PATCH 11/89] update for language import Signed-off-by: BuyuanCui --- .../zh/graph_utils.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index fe002f247..a9c3367d0 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -51,6 +51,8 @@ | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"")) ) +INPUT_CASED = "cased" +INPUT_LOWER_CASED = "lower_cased" def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): """ @@ -81,6 +83,44 @@ def convert_space(fst) -> 'pynini.FstLike': return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) +def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): + labels = load_labels(input_file) + + if input_case == INPUT_CASED: + additional_labels = [] + for written, spoken, *weight
in labels: + written_capitalized = written[0].upper() + written[1:] + additional_labels.extend( + [ + [written_capitalized, spoken.capitalize()], # first letter capitalized + [ + written_capitalized, + spoken.upper().replace(" AND ", " and "), + ], # # add pairs with the all letters capitalized + ] + ) + + spoken_no_space = spoken.replace(" ", "") + # add abbreviations without spaces (both lower and upper case), i.e. "BMW" not "B M W" + if len(spoken) == (2 * len(spoken_no_space) - 1): + print(f"This is weight {weight}") + if len(weight) == 0: + additional_labels.extend( + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]] + ) + else: + additional_labels.extend( + [ + [written, spoken_no_space, weight[0]], + [written_capitalized, spoken_no_space.upper(), weight[0]], + ] + ) + labels += additional_labels + + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + class GraphFst: """ Base class for all grammar fsts. From cbeeba0c2f2fad52affed71d2ef711ab277d86e4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 14 Mar 2023 16:17:06 +0000 Subject: [PATCH 12/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/zh/graph_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index a9c3367d0..ff739e428 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -54,6 +54,7 @@ INPUT_CASED = "cased" INPUT_LOWER_CASED = "lower_cased" + def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): """ Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. 
From 4b0cad3d19beb3a54edb2d84e4842fd479939f45 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 14 Mar 2023 09:17:29 -0700 Subject: [PATCH 13/89] update for Chinese punctuations Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/punctuation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py index 5c89d51d6..74c098ed9 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py @@ -26,7 +26,7 @@ class PunctuationFst(GraphFst): def __init__(self): super().__init__(name="punctuation", kind="classify") - s = "!#$%&\'()*+,-./:;<=>?@^_`{|}~" + s = "!#$%&\'()*+,-./:;<=>?@^_`{|}~。,;:《》“”·~【】!?、‘’.<>-——_" punct = pynini.union(*s) graph = pynutil.insert("name: \"") + punct + pynutil.insert("\"") From ba8e11086dfd3521e1a6619d57108e5e884c247c Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 14 Mar 2023 09:18:33 -0700 Subject: [PATCH 14/89] a new class for whitelist Signed-off-by: BuyuanCui --- .../zh/data/whitelist.tsv | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/whitelist.tsv diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/whitelist.tsv new file mode 100644 index 000000000..33cd63758 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/whitelist.tsv @@ -0,0 +1,21 @@ +人力资源 HR +自动取款机 ATM +人力资源 HR +首席执行官 CEO +美国研究生入学考试 GRE +研究生管理专业入学考试 GMAT +全球定位系统 GPS +刷卡机 POS机 +数位多功能光碟 DVD +镭射唱片 CD +通用串行总线 USB +统一资源定位符 URL +虚拟专用网络 VPN +网络互联协议 IP +脱氧核糖核酸 DNA +核糖核酸 RNA +平均学分绩点 GPA +发光二极管 LED +可移植文档格式 PDF +社会性网络服务 SNS +博士 PhD From 992a64469e3b012b720bd472c3bece18be944617 Mon Sep 17 00:00:00 2001 From: 
BuyuanCui Date: Mon, 27 Mar 2023 11:07:47 -0700 Subject: [PATCH 15/89] PYNINI_AVAILABLE = False Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/utils.py | 47 ------------------- 1 file changed, 47 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/zh/utils.py diff --git a/nemo_text_processing/inverse_text_normalization/zh/utils.py b/nemo_text_processing/inverse_text_normalization/zh/utils.py deleted file mode 100644 index ca6210150..000000000 --- a/nemo_text_processing/inverse_text_normalization/zh/utils.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import Union - -import inflect - -_inflect = inflect.engine() - - -def num_to_word(x: Union[str, int]): - """ - converts integer to spoken representation - - Args - x: integer - - Returns: spoken representation - """ - if isinstance(x, int): - x = str(x) - x = _inflect.number_to_words(str(x)).replace("-", " ").replace(",", "") - return x - - -def get_abs_path(rel_path): - """ - Get absolute path - - Args: - rel_path: relative path to this file - - Returns absolute path - """ - return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path From b8134fb04cae1e783ffec90473e045f245516fd4 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:23:32 -0700 Subject: [PATCH 16/89] recreated due to file import format issue Signed-off-by: BuyuanCui --- .../zh/data/date/day-nano.tsv | 74 ------------------- 1 file changed, 74 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/date/day-nano.tsv diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/date/day-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/date/day-nano.tsv deleted file mode 100644 index fd3e3ddab..000000000 --- a/nemo_text_processing/inverse_text_normalization/zh/data/date/day-nano.tsv +++ /dev/null @@ -1,74 +0,0 @@ -一 1 -二 2 -三 3 -四 4 -五 5 -六 6 -七 7 -八 8 -九 9 -十 10 -十一 11 -十二 12 -十三 13 -十四 14 -十五 15 -十六 16 -十七 17 -十八 18 -十九 19 -二十 20 -二十一 21 -二十二 22 -二十三 23 -二十四 24 -二十五 25 -二十六 26 -二十七 27 -二十八 28 -二十九 29 -三十 30 -三十一 31 -壹 1 -貳 2 -參 3 -肆 4 -伍 5 -陸 6 -柒 7 -捌 8 -玖 9 -幺 1 -两 2 -兩 2 -拾 10 -拾壹 11 -拾貳 12 -拾叁 13 -拾肆 14 -拾伍 15 -拾陸 16 -拾柒 17 -拾捌 18 -拾玖 19 -貳拾 20 -貳拾壹 21 -貳拾貳 22 -貳拾叁 23 -貳拾肆 24 -貳拾伍 25 -貳拾陸 26 -貳拾柒 27 -貳拾捌 28 -貳拾玖 29 -叁拾 30 -叁拾壹 31 -壹 1 -拾壹 11 -贰拾壹 21 -贰 2 -陆 6 -拾贰 12 -拾陆 16 -贰拾贰 22 -贰拾陆 26 \ No newline at end of file From f2bd6d20d5be21ece9f010b038bd2ebebf027c5a Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:24:02 -0700 Subject: [PATCH 17/89] recreated due to format issue 
Signed-off-by: BuyuanCui --- .../zh/data/date/month-nano.tsv | 49 ------------------- 1 file changed, 49 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/date/month-nano.tsv diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/date/month-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/date/month-nano.tsv deleted file mode 100644 index 5b2f33539..000000000 --- a/nemo_text_processing/inverse_text_normalization/zh/data/date/month-nano.tsv +++ /dev/null @@ -1,49 +0,0 @@ -一 1 -二 2 -三 3 -四 4 -五 5 -六 6 -七 7 -八 8 -九 9 -十 10 -十一 11 -十二 12 -一十 10 -零一 1 -零二 2 -零三 3 -零四 4 -零五 5 -零六 6 -零七 7 -零八 8 -零九 9 -壹 1 -贰 2 -叁 3 -肆 4 -伍 5 -陆 6 -柒 7 -捌 8 -玖 9 -拾 10 -拾壹 11 -拾贰 12 -壹拾 10 -零壹 1 -零贰 2 -零叁 3 -零肆 4 -零伍 5 -零陆 6 -零柒 7 -零捌 8 -零玖 9 -貳 2 -零貳 2 -陸 6 -零陸 6 -拾貳 12 \ No newline at end of file From fc17b3ab185c27a572afaad149ac6bd1be3a1df3 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:24:58 -0700 Subject: [PATCH 18/89] caught duplicates, removed Signed-off-by: BuyuanCui --- .../zh/data/money/currency_major-nano.tsv | 77 ------------------- 1 file changed, 77 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major-nano.tsv diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major-nano.tsv deleted file mode 100644 index 22d7a0579..000000000 --- a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major-nano.tsv +++ /dev/null @@ -1,77 +0,0 @@ -美元 US$ -欧元 € -歐元 € -英镑 £ -英鎊 £ -加拿大元 CAD$ -加拿大币 CAD$ -加拿大幣 CAD$ -加元 CAD$ -加币 CAD$ -加幣 CAD$ -瑞士法郎 Fr -法郎 ₣ -圆 ¥ -圓 ¥ -瑞典克朗 Kr -墨西哥比索 NXN$ -新西兰元 NZD$ -新西蘭元 NZD$ -新加坡币 SGD$ -新加坡幣 SGD$ -新加坡元 SGD$ -港元 HKD$ -港币 HKD$ -港幣 HKD$ -挪威克朗 NOKkr -韩元 ₩ -韓元 ₩ -韩币 ₩ -韓幣 ₩ -土耳其里拉 TRY₺ -印度卢布 ₹ -印度盧布 ₹ -印度卢比 ₹ -印度盧比 ₹ -俄罗斯卢布 ₽ -俄羅斯盧布 ₽ -俄罗斯卢比 ₽ -俄羅斯盧比 ₽ -巴西雷亚尔 BRLR$ -巴西雷亞爾 BRLR$ -南非兰特 R -南非蘭特 
R -丹麦克朗 DKKkr -丹麥克朗 DKKkr -波兰兹罗提 zł -波蘭兹儸提 zł -新台币 TWDNT$ -新臺幣 TWDNT$ -泰铢 ฿ -泰銖 ฿ -马来西亚林吉特 RM -馬來西亞林吉特 RM -印尼盾 Rp -匈牙利福林 Ft -捷克克朗 Kč -以色列新谢克尔 ₪ -以色列新謝克爾 ₪ -智利披索 CLP$ -菲律宾披索 ₱ -菲律賓披索 ₱ -阿联酋迪拉姆 د.إ -阿聯酋迪拉姆 د.إ -哥伦比亚披索 COL$ -哥倫比亞披索 COL$ -马来西亚令吉 RM -馬來西亞令吉 RM -罗马尼亚列伊 L -羅馬尼亞列伊 L -日元 JPY¥ -日圆 JPY¥ -日圓 JPY¥ -元 ¥ -圓 ¥ -圆 ¥ -人民币 ¥ -人民幣 ¥ \ No newline at end of file From fe52b294fc988c130d59b261d81060e60aec8e06 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:26:27 -0700 Subject: [PATCH 19/89] removed duplicates, arranges for CHInese Yuan updates Signed-off-by: BuyuanCui --- .../zh/data/money/currency_major.tsv | 75 +++++++++++++++++++ ...ency_minor-nano.tsv => currency_minor.tsv} | 1 - ...t-nano.tsv => currency_rmb_minor_cent.tsv} | 0 ...ano.tsv => currency_rmb_minor_tencent.tsv} | 0 4 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major.tsv rename nemo_text_processing/inverse_text_normalization/zh/data/money/{currency_minor-nano.tsv => currency_minor.tsv} (88%) rename nemo_text_processing/inverse_text_normalization/zh/data/money/{currency_rmb_minor_cent-nano.tsv => currency_rmb_minor_cent.tsv} (100%) rename nemo_text_processing/inverse_text_normalization/zh/data/money/{currency_rmb_minor_tencent-nano.tsv => currency_rmb_minor_tencent.tsv} (100%) diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major.tsv new file mode 100644 index 000000000..d9b1a6c8f --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major.tsv @@ -0,0 +1,75 @@ +美元 US$ +欧元 € +歐元 € +英镑 £ +英鎊 £ +加拿大元 CAD$ +加拿大币 CAD$ +加拿大幣 CAD$ +加元 CAD$ +加币 CAD$ +加幣 CAD$ +瑞士法郎 Fr +法郎 ₣ +圆 ¥ +圓 ¥ +瑞典克朗 Kr +墨西哥比索 NXN$ +新西兰元 NZD$ +新西蘭元 NZD$ +新加坡币 SGD$ +新加坡幣 SGD$ +新加坡元 SGD$ +港元 HKD$ +港币 HKD$ +港幣 HKD$ +挪威克朗 NOKkr +韩元 ₩ +韓元 ₩ +韩币 ₩ +韓幣 ₩ +土耳其里拉 TRY₺ +印度卢布 ₹ +印度盧布 ₹ +印度卢比 
₹ +印度盧比 ₹ +俄罗斯卢布 ₽ +俄羅斯盧布 ₽ +俄罗斯卢比 ₽ +俄羅斯盧比 ₽ +巴西雷亚尔 BRLR$ +巴西雷亞爾 BRLR$ +南非兰特 R +南非蘭特 R +丹麦克朗 DKKkr +丹麥克朗 DKKkr +波兰兹罗提 zł +波蘭兹儸提 zł +新台币 TWDNT$ +新臺幣 TWDNT$ +泰铢 ฿ +泰銖 ฿ +马来西亚林吉特 RM +馬來西亞林吉特 RM +印尼盾 Rp +匈牙利福林 Ft +捷克克朗 Kč +以色列新谢克尔 ₪ +以色列新謝克爾 ₪ +智利披索 CLP$ +菲律宾披索 ₱ +菲律賓披索 ₱ +阿联酋迪拉姆 د.إ +阿聯酋迪拉姆 د.إ +哥伦比亚披索 COL$ +哥倫比亞披索 COL$ +马来西亚令吉 RM +馬來西亞令吉 RM +罗马尼亚列伊 L +羅馬尼亞列伊 L +日元 JPY¥ +日圆 JPY¥ +日圓 JPY¥ +人民币 ¥ +人民幣 ¥ +元 ¥ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_minor-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_minor.tsv similarity index 88% rename from nemo_text_processing/inverse_text_normalization/zh/data/money/currency_minor-nano.tsv rename to nemo_text_processing/inverse_text_normalization/zh/data/money/currency_minor.tsv index f39777e21..d0451613a 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_minor-nano.tsv +++ b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_minor.tsv @@ -7,4 +7,3 @@ 分 NXN$ 新西兰仙 NZD$ 挪威欧尔 NOKOre -分 ¥ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_cent-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_cent.tsv similarity index 100% rename from nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_cent-nano.tsv rename to nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_cent.tsv diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_tencent-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_tencent.tsv similarity index 100% rename from nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_tencent-nano.tsv rename to nemo_text_processing/inverse_text_normalization/zh/data/money/currency_rmb_minor_tencent.tsv From 
63ee92a2550a9726bfce5dffed025ca760048067 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:27:33 -0700 Subject: [PATCH 20/89] updates accordingly to the comments from last PR. Recreated some of the files due to format issues Signed-off-by: BuyuanCui --- .../zh/data/numbers/digit-nano.tsv | 6 ++--- .../zh/data/numbers/tens-nano.tsv | 23 ------------------- .../{tens_re-nano.tsv => ties-nano.tsv} | 2 -- .../data/numbers/{zero-nano.tsv => zero.tsv} | 0 4 files changed, 2 insertions(+), 29 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens-nano.tsv rename nemo_text_processing/inverse_text_normalization/zh/data/numbers/{tens_re-nano.tsv => ties-nano.tsv} (90%) rename nemo_text_processing/inverse_text_normalization/zh/data/numbers/{zero-nano.tsv => zero.tsv} (100%) diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/digit-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/digit-nano.tsv index 04a36a2ce..d6bb500ae 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/digit-nano.tsv +++ b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/digit-nano.tsv @@ -8,17 +8,15 @@ 八 8 九 9 壹 1 -貳 2 贰 2 -參 3 叁 3 肆 4 伍 5 -陸 6 陆 6 柒 7 捌 8 玖 9 -幺 1 +貳 2 +陸 6 两 2 兩 2 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens-nano.tsv deleted file mode 100644 index a390e08d9..000000000 --- a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens-nano.tsv +++ /dev/null @@ -1,23 +0,0 @@ -十 1 -一十 1 -二十 2 -三十 3 -四十 4 -五十 5 -六十 6 -七十 7 -八十 8 -九十 9 -拾 1 -壹拾 1 -貳拾 2 -贰拾 2 -叁拾 3 -參拾 3 -肆拾 4 -伍拾 5 -陸拾 6 -陆拾 6 -柒拾 7 -捌拾 8 -玖拾 9 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens_re-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/ties-nano.tsv similarity index 90% rename from 
nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens_re-nano.tsv rename to nemo_text_processing/inverse_text_normalization/zh/data/numbers/ties-nano.tsv index 58dbe8879..d4ed9d9ef 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/tens_re-nano.tsv +++ b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/ties-nano.tsv @@ -1,4 +1,3 @@ -一十 1 二十 2 三十 3 四十 4 @@ -7,7 +6,6 @@ 七十 7 八十 8 九十 9 -壹拾 1 贰拾 2 叁拾 3 肆拾 4 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/numbers/zero-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/numbers/zero.tsv similarity index 100% rename from nemo_text_processing/inverse_text_normalization/zh/data/numbers/zero-nano.tsv rename to nemo_text_processing/inverse_text_normalization/zh/data/numbers/zero.tsv From 1481d2c794fd487fcbf94b130ad98a1823d0f8b7 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:29:27 -0700 Subject: [PATCH 21/89] removed the hours_to and minute_to files used for back counting. ALso removed am and pm suffix files according to the last PR. 
Recreated some of them for format issue Signed-off-by: BuyuanCui --- .../zh/data/time/hours_to-nano.tsv | 25 --- .../zh/data/time/minutes_to-nano.tsv | 59 ------- .../{time_hours-nano.tsv => time_hours.tsv} | 46 +++--- .../zh/data/time/time_mandarin.tsv | 60 ++++++++ ...time_minutes-nano.tsv => time_minutes.tsv} | 1 + .../zh/data/time/time_quarters.tsv | 9 ++ .../zh/data/time/time_seconds.tsv | 144 ++++++++++++++++++ 7 files changed, 237 insertions(+), 107 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/time/hours_to-nano.tsv delete mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/time/minutes_to-nano.tsv rename nemo_text_processing/inverse_text_normalization/zh/data/time/{time_hours-nano.tsv => time_hours.tsv} (66%) create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/time/time_mandarin.tsv rename nemo_text_processing/inverse_text_normalization/zh/data/time/{time_minutes-nano.tsv => time_minutes.tsv} (99%) create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/time/time_quarters.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/time/time_seconds.tsv diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/hours_to-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/hours_to-nano.tsv deleted file mode 100644 index a56219579..000000000 --- a/nemo_text_processing/inverse_text_normalization/zh/data/time/hours_to-nano.tsv +++ /dev/null @@ -1,25 +0,0 @@ -1 0 -2 1 -3 2 -4 3 -5 4 -6 5 -7 6 -8 7 -9 8 -10 9 -11 10 -12 11 -13 12 -14 13 -15 14 -16 15 -17 16 -18 17 -19 18 -20 19 -21 20 -22 21 -23 22 -24 23 -0 23 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/minutes_to-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/minutes_to-nano.tsv deleted file mode 100644 index 11ac8f2a9..000000000 --- 
a/nemo_text_processing/inverse_text_normalization/zh/data/time/minutes_to-nano.tsv +++ /dev/null @@ -1,59 +0,0 @@ -59 01 -58 02 -57 03 -56 04 -55 05 -54 06 -53 07 -52 08 -51 09 -50 10 -49 11 -48 12 -47 13 -46 14 -45 15 -44 16 -43 17 -42 18 -41 19 -40 20 -39 21 -38 22 -37 23 -36 24 -35 25 -34 26 -33 27 -32 28 -31 29 -30 30 -29 31 -28 32 -27 33 -26 34 -25 35 -24 36 -23 37 -22 38 -21 39 -20 40 -19 41 -18 42 -17 43 -16 44 -15 45 -14 46 -13 47 -12 48 -11 49 -10 50 -09 51 -08 52 -07 53 -06 54 -05 55 -04 56 -03 57 -02 58 -01 59 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_hours-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_hours.tsv similarity index 66% rename from nemo_text_processing/inverse_text_normalization/zh/data/time/time_hours-nano.tsv rename to nemo_text_processing/inverse_text_normalization/zh/data/time/time_hours.tsv index 4a00dc817..82a20bfea 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_hours-nano.tsv +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_hours.tsv @@ -1,12 +1,12 @@ -一 1 -二 2 -三 3 -四 4 -五 5 -六 6 -七 7 -八 8 -九 9 +一 01 +二 02 +三 03 +四 04 +五 05 +六 06 +七 07 +八 08 +九 09 十 10 十一 11 十二 12 @@ -22,15 +22,15 @@ 二十二 22 二十三 23 二十四 24 -壹 1 -貳 2 -參 3 -肆 4 -伍 5 -陸 6 -柒 7 -捌 8 -玖 9 +壹 01 +貳 02 +參 03 +肆 04 +伍 05 +陸 06 +柒 07 +捌 08 +玖 09 拾 10 拾壹 11 拾貳 12 @@ -46,10 +46,10 @@ 二十二 22 二十三 23 二十四 24 -兩 2 -两 2 -贰 2 -陆 6 +兩 02 +两 02 +贰 02 +陆 06 拾贰 12 拾陆 16 -贰拾贰 22 \ No newline at end of file +贰拾贰 22 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_mandarin.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_mandarin.tsv new file mode 100644 index 000000000..27bc5539a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_mandarin.tsv @@ -0,0 +1,60 @@ +一 1 +二 2 +三 3 +四 4 +五 5 +六 6 +七 7 +八 8 +九 9 +十 10 +十一 11 +十二 12 +十三 13 +十四 14 +十五 15 +十六 16 +十七 17 
+十八 18 +十九 19 +二十 20 +二十一 21 +二十二 22 +二十三 23 +二十四 24 +二十五 25 +二十六 26 +二十七 27 +二十八 28 +二十九 29 +三十 30 +三十一 31 +三十二 32 +三十三 33 +三十四 34 +三十五 35 +三十六 36 +三十七 37 +三十八 38 +三十九 39 +四十 40 +四十一 41 +四十二 42 +四十三 43 +四十四 44 +四十五 45 +四十六 46 +四十七 47 +四十八 48 +四十九 49 +五十 50 +五十一 51 +五十二 52 +五十三 53 +五十四 54 +五十五 55 +五十六 56 +五十七 57 +五十八 58 +五十九 59 +六十 60 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_minutes-nano.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_minutes.tsv similarity index 99% rename from nemo_text_processing/inverse_text_normalization/zh/data/time/time_minutes-nano.tsv rename to nemo_text_processing/inverse_text_normalization/zh/data/time/time_minutes.tsv index 808d9394d..081e2226b 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_minutes-nano.tsv +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_minutes.tsv @@ -161,3 +161,4 @@ 肆拾陸 46 伍拾陸 56 陸拾 60 +零 00 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_quarters.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_quarters.tsv new file mode 100644 index 000000000..099f1fedc --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_quarters.tsv @@ -0,0 +1,9 @@ +一 1 +二 2 +三 3 +四 4 +壹 1 +贰 2 +叁 3 +肆 4 +貳 2 diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_seconds.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_seconds.tsv new file mode 100644 index 000000000..fa4fc9dd8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_seconds.tsv @@ -0,0 +1,144 @@ +一 01 +二 02 +三 03 +四 04 +五 05 +六 06 +七 07 +八 08 +九 09 +十 10 +十一 11 +十二 12 +十三 13 +十四 14 +十五 15 +十六 16 +十七 17 +十八 18 +十九 19 +二十 20 +二十一 21 +二十二 22 +二十三 23 +二十四 24 +二十五 25 +二十六 26 +二十七 27 +二十八 28 +二十九 29 +三十 30 +三十一 31 +三十二 32 +三十三 33 +三十四 34 +三十五 35 +三十六 36 +三十七 37 +三十八 38 +三十九 39 +四十 40 +四十一 41 +四十二 42 +四十三 43 
+四十四 44 +四十五 45 +四十六 46 +四十七 47 +四十八 48 +四十九 49 +五十 50 +五十一 51 +五十二 52 +五十三 53 +五十四 54 +五十五 55 +五十六 56 +五十七 57 +五十八 58 +五十九 59 +六十 60 +壹 01 +貳 02 +叁 03 +肆 04 +伍 05 +陆 06 +柒 07 +捌 08 +玖 09 +拾 10 +拾壹 11 +拾贰 12 +拾叁 13 +拾肆 14 +拾伍 15 +拾陆 16 +拾柒 17 +拾捌 18 +拾玖 19 +贰拾 20 +贰拾壹 21 +贰拾贰 22 +贰拾叁 23 +贰拾肆 24 +贰拾伍 25 +贰拾陆 26 +贰拾柒 27 +贰拾捌 28 +贰拾玖 29 +叁拾 30 +叁拾壹 31 +叁拾贰 32 +叁拾叁 33 +叁拾肆 34 +叁拾伍 35 +叁拾陆 36 +叁拾柒 37 +叁拾捌 38 +叁拾玖 39 +肆拾 40 +肆拾壹 41 +肆拾贰 42 +肆拾叁 43 +肆拾肆 44 +肆拾伍 45 +肆拾陆 46 +肆拾柒 47 +肆拾捌 48 +肆拾玖 49 +伍拾 50 +伍拾壹 51 +伍拾贰 52 +伍拾叁 53 +伍拾肆 54 +伍拾伍 55 +伍拾陆 56 +伍拾柒 57 +伍拾捌 58 +伍拾玖 59 +陆拾 60 +貳 02 +陸 06 +兩 02 +两 02 +拾貳 12 +拾陸 16 +貳拾 20 +貳拾壹 21 +貳拾貳 22 +貳拾叁 23 +貳拾肆 24 +貳拾伍 25 +貳拾陸 26 +貳拾柒 27 +貳拾捌 28 +貳拾玖 29 +叁拾貳 32 +叁拾陸 36 +肆拾貳 42 +肆拾陸 46 +伍拾貳 52 +伍拾陸 56 +陸拾 60 +零 00 From d40a49987a6557ceee50923dd7715af7847507d2 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:30:35 -0700 Subject: [PATCH 22/89] re-added this file to avoid data file import error Signed-off-by: BuyuanCui --- .../zh/graph_utils.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index a9c3367d0..8ec83f113 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -31,9 +31,9 @@ NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() NEMO_HEX = pynini.union(*string.hexdigits).optimize() -NEMO_NON_BREAKING_SPACE = u"\u00A0" +NEMO_NON_BREAKING_SPACE = "\u00A0" NEMO_SPACE = " " -NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize() +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() @@ -48,13 
+48,14 @@ delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") delete_preserve_order = pynini.closure( pynutil.delete(" preserve_order: true") - | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"")) + | (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"')) ) INPUT_CASED = "cased" INPUT_LOWER_CASED = "lower_cased" -def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): + +def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]): """ Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. @@ -66,10 +67,10 @@ def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): for rule, graph in graphs.items(): exporter[rule] = graph.optimize() exporter.close() - print(f'Created {file_name}') + print(f"Created {file_name}") -def convert_space(fst) -> 'pynini.FstLike': +def convert_space(fst) -> "pynini.FstLike": """ Converts space to nonbreaking space. Used only in tagger grammars for transducing token values within quotes, e.g. 
name: "hello kitty" @@ -92,7 +93,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): written_capitalized = written[0].upper() + written[1:] additional_labels.extend( [ - [written_capitalized, spoken.capitalize()], # first letter capitalized + [written_capitalized, spoken.capitalize(),], # first letter capitalized [ written_capitalized, spoken.upper().replace(" AND ", " and "), @@ -106,7 +107,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): print(f"This is weight {weight}") if len(weight) == 0: additional_labels.extend( - [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]] + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],] ) else: additional_labels.extend( @@ -138,7 +139,7 @@ def __init__(self, name: str, kind: str, deterministic: bool = True): self._fst = None self.deterministic = deterministic - self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far') + self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far") if self.far_exist(): self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() @@ -149,14 +150,14 @@ def far_exist(self) -> bool: return self.far_path.exists() @property - def fst(self) -> 'pynini.FstLike': + def fst(self) -> "pynini.FstLike": return self._fst @fst.setter def fst(self, fst): self._fst = fst - def add_tokens(self, fst) -> 'pynini.FstLike': + def add_tokens(self, fst) -> "pynini.FstLike": """ Wraps class name around to given fst @@ -168,7 +169,7 @@ def add_tokens(self, fst) -> 'pynini.FstLike': """ return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") - def delete_tokens(self, fst) -> 'pynini.FstLike': + def delete_tokens(self, fst) -> "pynini.FstLike": """ Deletes class name wrap around output of given fst @@ -187,4 +188,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike': + delete_space + pynutil.delete("}") 
) - return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA) From 7a822f3cd6f106533448044e08df474ba2f35987 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:31:22 -0700 Subject: [PATCH 23/89] =?UTF-8?q?updated=20gramamr=20according=20to=20last?= =?UTF-8?q?=20PR.=20Removed=20the=20acceptance=20of=20=E5=8D=83?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: BuyuanCui --- .../zh/taggers/cardinal.py | 31 ++++++++----------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index 43d701f3c..8ad7a597d 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -21,7 +21,7 @@ class CardinalFst(GraphFst): def __init__(self): """ - Fitite state transducer for classifying cardinals (e.g., 负五十 -> cardinal { negative: "-"integer: "50" }) + Fitite state transducer for classifying cardinals (e.g., 负五十 -> cardinal { negative: "-" integer: "50" }) This class converts cardinals up to hundred millions (i.e., (10**10)) Single unit digits are not converted (e.g., 五 -> 五) Numbers less than 20 are not converted. 
@@ -31,7 +31,6 @@ def __init__(self): # number of digits to be processed delete_hundreds = pynutil.delete("百") | pynutil.delete("佰") - closure_thousands = pynini.accep("千") | pynini.accep("仟") delete_thousands = pynutil.delete("千") | pynutil.delete("仟") closure_ten_thousands = pynini.accep("萬") | pynini.accep("万") delete_ten_thousands = pynutil.delete("萬") | pynutil.delete("万") @@ -39,27 +38,22 @@ def __init__(self): delete_hundred_millions = pynutil.delete("亿") | pynutil.delete("億") # data imported - zero = pynini.string_file(get_abs_path("data/numbers/zero-nano.tsv")) + zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) digits = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) - tens = pynini.string_file(get_abs_path("data/numbers/tens-nano.tsv")) + ties = pynini.string_file(get_abs_path("data/numbers/ties-nano.tsv")) # grammar for digits graph_digits = digits | pynutil.insert("0") # grammar for teens - ten = pynini.string_map([("十", "1"), ("拾", "1")]) + ten = pynini.string_map([("十", "1"), ("拾", "1"), ("壹拾", "1"), ("壹拾", "1")]) graph_teens = ten + graph_digits graph_teens = graph_teens | pynutil.insert("0") # grammar for tens, not the output for Cardinal grammar but for pure Arabic digits (used in other grammars) - graph_tens = (tens + graph_digits) | (pynini.cross(pynini.accep("零"), "0") + graph_digits) + graph_tens = (ties + graph_digits) | (pynini.cross(pynini.accep("零"), "0") + graph_digits) graph_all = graph_tens | pynutil.insert("00") - # grammar for tens from 20 - 90 which only convert the ones with 3 Mandarin characters - tens_re = pynini.string_file(get_abs_path("data/numbers/tens_re-nano.tsv")) - digits_re = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) - graph_all_re = tens_re + digits_re - # grammar for hundreds 百 graph_hundreds_complex = (graph_digits + delete_hundreds + graph_all) | ( graph_digits + delete_hundreds + pynini.cross(pynini.closure("零"), "0") + graph_digits @@ -68,14 +62,12 @@ def 
__init__(self): graph_hundreds = graph_hundreds | pynutil.insert("000") # grammar for thousands 千 - graph_thousands_simple = graph_digits + closure_thousands graph_thousands_complex = ( (graph_digits + delete_thousands + graph_hundreds_complex) | (graph_digits + delete_thousands + pynini.cross(pynini.closure("零"), "0") + graph_all) | (graph_digits + delete_thousands + pynini.cross(pynini.closure("零"), "00") + graph_digits) ) - graph_thousands = graph_thousands_simple | graph_thousands_complex - graph_thousands = graph_thousands | pynutil.insert("000") + graph_thousands = graph_thousands_complex | pynutil.insert("000") # grammar for ten thousands 万 graph_ten_thousands_simple = graph_digits + closure_ten_thousands @@ -322,7 +314,10 @@ def __init__(self): graph_ten_thousands, graph_thousands, graph_hundreds, - graph_all_re, + graph_all, + graph_teens, + graph_digits, + zero, ) # combining grammar; output consists only arabic numbers @@ -361,9 +356,9 @@ def __init__(self): self.just_cardinals = graph_just_cardinals # used for other grammars # final grammar for cardinal output; tokenization - optional_minus_graph = (pynini.closure(pynutil.insert("negative: ") + pynini.cross("负", "\"-\""))) | ( - pynini.closure(pynutil.insert("negative: ") + pynini.cross("負", "\"-\"")) + optional_minus_graph = (pynini.closure(pynutil.insert("negative: ") + pynini.cross("负", '"-"'))) | ( + pynini.closure(pynutil.insert("negative: ") + pynini.cross("負", '"-"')) ) - final_graph = optional_minus_graph + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") + final_graph = optional_minus_graph + pynutil.insert('integer: "') + graph + pynutil.insert('"') final_graph = self.add_tokens(final_graph) self.fst = final_graph From 37b7be2472245538a133810a48a1fa67a36c894e Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:32:32 -0700 Subject: [PATCH 24/89] updates Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/date.py | 14 +++++++------- 1 file changed, 7 
insertions(+), 7 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py index 190b49b4e..55e77aeba 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py @@ -29,8 +29,8 @@ def __init__(self): super().__init__(name="date", kind="classify") digits = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) # imported for year-component - months = pynini.string_file(get_abs_path("data/date/month-nano.tsv")) # imported for month-component - days = pynini.string_file(get_abs_path("data/date/day-nano.tsv")) # imported for day-component + months = pynini.string_file(get_abs_path("data/date/months.tsv")) # imported for month-component + days = pynini.string_file(get_abs_path("data/date/day.tsv")) # imported for day-component # grammar for year graph_year = ( @@ -40,15 +40,15 @@ def __init__(self): + pynini.closure(pynini.cross("零", "0")) + pynutil.delete("年") ) - graph_year = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + graph_year = pynutil.insert('year: "') + graph_year + pynutil.insert('"') # grammar for month - graph_month = pynutil.insert("month: \"") + months + pynutil.delete("月") + pynutil.insert("\"") + graph_month = pynutil.insert('month: "') + months + pynutil.delete("月") + pynutil.insert('"') # grammar for day graph_day_suffix = pynini.accep("日") | pynini.accep("号") | pynini.accep("號") graph_delete_day_suffix = pynutil.delete(graph_day_suffix) - graph_day = pynutil.insert("day: \"") + days + graph_delete_day_suffix + pynutil.insert("\"") + graph_day = pynutil.insert('day: "') + days + graph_delete_day_suffix + pynutil.insert('"') # grammar for combinations of year+month, month+day, and year+month+day graph_ymd = graph_year + pynutil.insert(" ") + graph_month + pynutil.insert(" ") + graph_day @@ -72,10 +72,10 @@ def __init__(self): 
graph_ad = pynutil.delete(graph_ad_prefix) graph_suffix_bc = ( - graph_bc + graph_date + pynutil.insert(" era: \"") + pynutil.insert("B.C.") + pynutil.insert("\"") + graph_bc + graph_date + pynutil.insert(' era: "') + pynutil.insert("B.C.") + pynutil.insert('"') ) graph_suffix_ad = ( - graph_ad + graph_date + pynutil.insert(" era: \"") + pynutil.insert("A.D.") + pynutil.insert("\"") + graph_ad + graph_date + pynutil.insert(' era: "') + pynutil.insert("A.D.") + pynutil.insert('"') ) graph_era = graph_suffix_bc | graph_suffix_ad From 5cf6d45c8d0e384f91c22e15a77225be66d1f2ad Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:33:16 -0700 Subject: [PATCH 25/89] updated according to last PR. Removed comma after decimal points Signed-off-by: BuyuanCui --- .../zh/taggers/decimal.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py index bd422d70e..95bfd30c8 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py @@ -52,14 +52,14 @@ def get_quantity(decimal, cardinal): ) numbers = cardinal res = ( - pynutil.insert("integer_part: \"") + pynutil.insert('integer_part: "') + numbers - + pynutil.insert("\"") - + pynutil.insert(" quantity: \"") + + pynutil.insert('"') + + pynutil.insert(' quantity: "') + suffix - + pynutil.insert("\"") + + pynutil.insert('"') ) - res = res | decimal + pynutil.insert(" quantity: \"") + suffix + pynutil.insert("\"") + res = res | decimal + pynutil.insert(' quantity: "') + suffix + pynutil.insert('"') return res @@ -77,11 +77,11 @@ def __init__(self, cardinal: GraphFst): # grammar for integer part graph_integer = ( - pynutil.insert("integer_part: \"") + pynutil.insert('integer_part: "') + (cardinal_before_decimal | (pynini.closure(pynini.cross("零", "0"), 
0, 1))) - + pynutil.insert("\" ") + + pynutil.insert('" ') ) # tokenization on just numbers - graph_integer_or_none = graph_integer | pynutil.insert("integer_part: \"0\" ", weight=0.01) # integer or zero + graph_integer_or_none = graph_integer | pynutil.insert('integer_part: "0" ', weight=0.01) # integer or zero # grammar for fractional part delete_zero = pynini.closure(pynini.cross("零", "0")) @@ -89,7 +89,7 @@ def __init__(self, cardinal: GraphFst): graph_string_of_cardinals = ( pynini.closure(graph_string_of_cardinals) + delete_zero + pynini.closure(graph_string_of_cardinals) ) - graph_fractional = pynutil.insert("fractional_part: \"") + graph_string_of_cardinals + pynutil.insert("\"") + graph_fractional = pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"') # grammar for decimal: integer+delete character+part after decimal point graph_decimal_no_sign = graph_integer_or_none + delete_decimal + graph_fractional @@ -99,7 +99,7 @@ def __init__(self, cardinal: GraphFst): graph_decimal_no_sign, cardinal.just_cardinals ) - graph_negative = pynini.cross("负", "negative: \"-\" ") | pynini.cross("負", "negative: \"-\" ") + graph_negative = pynini.cross("负", 'negative: "-" ') | pynini.cross("負", 'negative: "-" ') graph_negative = pynini.closure(graph_negative, 0, 1) # captures only one "负" graph_decimal = graph_negative + graph_decimal_no_sign From eb392705672acffb1bd9241d482ad0cc767ae489 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:33:52 -0700 Subject: [PATCH 26/89] gramamr for Fraction Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/fraction.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py index 225ade447..33fcd20a9 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py +++ 
b/nemo_text_processing/inverse_text_normalization/zh/taggers/fraction.py @@ -31,11 +31,11 @@ def __init__(self, cardinal: GraphFst): super().__init__(name="fraction", kind="classify") graph_cardinal = cardinal.just_cardinals - integer_component = pynutil.insert("integer_part: \"") + graph_cardinal + pynutil.insert("\"") + integer_component = pynutil.insert('integer_part: "') + graph_cardinal + pynutil.insert('"') denominator_component = ( - pynutil.insert("denominator: \"") + graph_cardinal + pynutil.delete("分之") + pynutil.insert("\"") + pynutil.insert('denominator: "') + graph_cardinal + pynutil.delete("分之") + pynutil.insert('"') ) - numerator_component = pynutil.insert("numerator: \"") + graph_cardinal + pynutil.insert("\"") + numerator_component = pynutil.insert('numerator: "') + graph_cardinal + pynutil.insert('"') graph_only_fraction = denominator_component + pynutil.insert(" ") + numerator_component graph_fraction_with_int = integer_component + pynutil.delete("又") + pynutil.insert(" ") + graph_only_fraction From 4fcda3d874e94bea4f83f0fc597b8347c76b968c Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:34:33 -0700 Subject: [PATCH 27/89] =?UTF-8?q?=20gramamr=20for=20money=20and=20updated?= =?UTF-8?q?=20according=20to=20last=20PR.=20Plus=20process=20of=20?= =?UTF-8?q?=E5=85=83?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: BuyuanCui --- .../zh/verbalizers/money.py | 43 ++++++++----------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py index d8ca03a66..9b1e8b637 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py @@ -21,22 +21,18 @@ class MoneyFst(GraphFst): def __init__(self): super().__init__(name="money", 
kind="verbalize") - currency_unit = pynutil.delete("currency: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - number_unit = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - fraction_unit = ( - pynutil.delete("fractional_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - ) - cent_unit = pynutil.delete("cent_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - tencent_unit = pynutil.delete("tencent_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + currency_unit = pynutil.delete('currency: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + number_unit = pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + fraction_unit = pynutil.delete('fractional_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') decimal_unit = ( pynutil.insert(".") - + pynutil.delete("fractional_part: \"") + + pynutil.delete('fractional_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") + + pynutil.delete('"') + delete_space - + pynutil.delete("quantity: \"") + + pynutil.delete('quantity: "') + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") + + pynutil.delete('"') ) # regular money part @@ -49,21 +45,20 @@ def __init__(self): graph_regular = graph_money_regular | graph_only_major_regular | graph_only_minor_regular | graph_large_money - # yuan part - graph_money_yuan = ( - currency_unit - + delete_space - + number_unit - + delete_space - + pynutil.insert(".") - + ((pynutil.insert("0") + cent_unit) | (tencent_unit) | (tencent_unit + delete_space + cent_unit)) - ) - graph_yuan_minors = ( - currency_unit + delete_space + pynutil.insert("0.") + tencent_unit + delete_space + cent_unit + major_symbol = pynini.accep("块") + minor_symbol = pynini.accep("毛") | pynini.accep("角") + lesser_symbol = pynini.accep("分") + major_currency = pynutil.delete('currency_major: "') + major_symbol + 
pynutil.delete('"') + minor_currency = pynutil.delete('currency: "') + minor_symbol + pynutil.delete('"') + lesser_currency = pynutil.delete('currency_minor: "') + lesser_symbol + pynutil.delete('"') + + graph_kuai = number_unit + delete_space + major_currency + graph_mao = ( + number_unit + delete_space + major_currency + delete_space + number_unit + delete_space + minor_currency ) - graph_yuan = graph_money_yuan | graph_yuan_minors + # | (number_unit + delete_space + major_currency + delete_space + number_unit + delete_space + minor_currency + delete_space + number_unit + delete_space + lesser_currency) - graph_verbalizer = graph_regular | graph_yuan + graph_verbalizer = graph_regular | pynutil.add_weight(graph_mao, -2.0) delete_tokens = self.delete_tokens(graph_verbalizer) self.fst = delete_tokens.optimize() From 60fddba7c96190dc10bacfe877f2990bdc189274 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:35:23 -0700 Subject: [PATCH 28/89] ordinal grammar. updates due to the updates in cardinal grammar Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/verbalizers/ordinal.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py index 7f65f3c69..93f2a678d 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/ordinal.py @@ -28,11 +28,11 @@ def __init__(self): graph_integer = ( pynutil.delete("integer:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.accep("第") + pynini.closure(NEMO_DIGIT) + pynini.closure(NEMO_SIGMA) - + pynutil.delete("\"") + + pynutil.delete('"') ) delete_tokens = self.delete_tokens(graph_integer) From 7374ef59af59ae53d98609b96af4b14d83a14673 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:36:43 -0700 Subject: 
[PATCH 29/89] updated accordingly to last PR comments. removing am and pm and allowing simple mandarin expression Signed-off-by: BuyuanCui --- .../zh/verbalizers/time.py | 104 ++++++++++++------ 1 file changed, 70 insertions(+), 34 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py index 159668f7b..e39b200d1 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py @@ -21,61 +21,97 @@ class TimeFst(GraphFst): """ Finite state transcucer for verbalizing time, e.g., time { hours: "12" minutes: "30" } -> 12:30 - time { hours: "1" minutes: "30" } -> 01:30 - time { hours: "1" minutes: "30" affix: "a.m." } -> 01:30 a.m. + time { hours: "01" minutes: "30" } -> 01:30 + time { hours: "1" minutes: "30" seconds: "05" } -> 01:30:05 + time { hours: "1" minutes: "1刻" } -> 1点1刻 + time { hours: "一点" } -> 1点 + time { hours: "一小时" } -> 1小时 + time { hours: "一个钟头" } -> 1个钟头 + time { minutes: "一分" } -> 1分 + time { minutes: "一分钟" } -> 1分钟 + time { seconds: "一秒" } -> 1秒 + time { seconds: "一秒钟" } -> 1秒钟 + time { hours: "五点" minutes: "一刻" } -> 5点1刻 """ def __init__(self): super().__init__(name="time", kind="verbalize") - add_leading_zero = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT) + # add_leading_zero = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT) token_hour = ( pynutil.delete("hours:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_DIGIT, 1, 2) - + pynutil.delete("\"") + + pynutil.delete('"') ) token_minute = ( pynutil.delete("minutes:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_DIGIT, 1, 2) - + pynutil.delete("\"") + + pynutil.delete('"') ) - - affix_am = ( - delete_space - + pynutil.delete("affix:") + token_second = ( + pynutil.delete("seconds:") + 
delete_space - + pynutil.delete("\"") - + pynini.accep("a.m.") - + pynutil.delete("\"") + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT, 1, 2) + + pynutil.delete('"') + ) + + add_colon = pynutil.insert(":") + graph_regular_time = (token_hour + delete_space + add_colon + token_minute) | ( + token_hour + delete_space + add_colon + token_minute + delete_space + add_colon + token_second ) - affix_am = pynutil.insert(" ") + pynini.closure(affix_am, 0, 1) - graph_am = token_hour @ add_leading_zero + delete_space + pynutil.insert(":") + token_minute - graph_am_affix = token_hour @ add_leading_zero + delete_space + pynutil.insert(":") + token_minute + affix_am - graph_am = graph_am | graph_am_affix - # 5:00 p.m. -> 17:00 or keep 17:00 as 17:00 - affix_pm = ( - delete_space - + pynutil.delete("affix:") + hours = ( + pynini.accep("点") + | pynini.accep("小时") + | pynini.accep("时") + | pynini.accep("个钟头") + | pynini.accep("个点") + | pynini.accep("半") + ) + hour_mandarin = ( + pynutil.delete("hours:") + + delete_space + + pynutil.delete('"') + + (pynini.closure(NEMO_DIGIT) + pynini.closure(hours, 1)) + + pynutil.delete('"') + ) + minutes = pynini.accep("分") | pynini.accep("分钟") + minute_mandarin = ( + pynutil.delete("minutes:") + delete_space - + pynutil.delete("\"") - + pynini.accep("p.m.") - + pynutil.delete("\"") + + pynutil.delete('"') + + (pynini.closure(NEMO_DIGIT) + pynini.closure(minutes, 1)) + + pynutil.delete('"') ) - graph_pm = token_hour @ add_leading_zero + delete_space + pynutil.insert(":") + token_minute - graph_pm_affix = ( - token_hour @ add_leading_zero + seconds = pynini.accep("秒") | pynini.accep("秒钟") + second_mandarin = ( + pynutil.delete("seconds:") + delete_space - + pynutil.insert(":") - + token_minute - + pynutil.insert(" ") - + affix_pm + + pynutil.delete('"') + + (pynini.closure(NEMO_DIGIT) + pynini.closure(seconds, 1)) + + pynutil.delete('"') + ) + quarters = pynini.accep("刻") | pynini.accep("刻钟") + quarter_mandarin = ( + 
pynutil.delete("minutes:") + + delete_space + + pynutil.delete('"') + + (pynini.closure(NEMO_DIGIT) + pynini.closure(quarters, 1)) + + pynutil.delete('"') + ) + + graph_mandarin_time = ( + hour_mandarin + | minute_mandarin + | second_mandarin + | quarter_mandarin + | (hour_mandarin + delete_space + quarter_mandarin) ) - graph_pm = graph_pm | graph_pm_affix - final_graph = graph_am | graph_pm + final_graph = graph_regular_time | graph_mandarin_time delete_tokens = self.delete_tokens(final_graph) self.fst = delete_tokens.optimize() From bb7f905bed39cce5a7fc28af7296cba3a990ff8b Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:37:23 -0700 Subject: [PATCH 30/89] arrangements Signed-off-by: BuyuanCui --- .../zh/verbalizers/verbalize.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py index 05bc66f70..8a1ae31da 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# from nemo_text_processing.inverse_text_normalization.zh.verbalizers.whitelist import WhiteListFst from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.zh.verbalizers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.zh.verbalizers.date import DateFst @@ -27,6 +26,7 @@ # from nemo_text_processing.inverse_text_normalization.zh.verbalizers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.zh.verbalizers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.zh.verbalizers.whitelist import WhiteListFst class VerbalizeFst(GraphFst): @@ -49,6 +49,16 @@ def __init__(self): money_graph = money.fst time_graph = TimeFst().fst date_graph = DateFst().fst + whitelist_graph = WhiteListFst().fst - graph = time_graph | date_graph | money_graph | fraction_graph | ordinal_graph | decimal_graph | cardinal_graph + graph = ( + time_graph + | date_graph + | money_graph + | fraction_graph + | ordinal_graph + | decimal_graph + | cardinal_graph + | whitelist_graph + ) self.fst = graph From 608e98b24ab920aa57eeabc4f11bd93d29e4d478 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:37:46 -0700 Subject: [PATCH 31/89] added whitelist grammar Signed-off-by: BuyuanCui --- .../zh/verbalizers/whitelist.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py new file mode 100644 index 000000000..994935b2b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py @@ -0,0 +1,42 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( + NEMO_CHAR, + NEMO_SIGMA, + GraphFst, + delete_space, +) +from pynini.lib import pynutil + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for verbalizing whitelist + e.g. tokens { name: "USB" } -> USB + """ + + def __init__(self): + super().__init__(name="whitelist", kind="verbalize") + graph = ( + pynutil.delete("name:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_CHAR - " ", 1) + + pynutil.delete('"') + ) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + self.fst = graph.optimize() From a17090b4f757515d15e28a98f3971733b45dfd94 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:38:40 -0700 Subject: [PATCH 32/89] word grammar for non-classified items Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/verbalizers/word.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py index 1456c4047..5888e2d8c 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py @@ -31,7 +31,7 @@ class WordFst(GraphFst): def __init__(self): super().__init__(name="word", kind="verbalize") 
chars = pynini.closure(NEMO_CHAR - " ", 1) - char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") + char = pynutil.delete("name:") + delete_space + pynutil.delete('"') + chars + pynutil.delete('"') graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() From 1d2af16c8549aed64ad201c2b04de7609190a9a0 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:39:21 -0700 Subject: [PATCH 33/89] updated cardinal, decimal, time, itn data Signed-off-by: BuyuanCui --- .../test_cases_cardinal.txt | 2 +- .../test_cases_decimal.txt | 16 ++++---- .../test_cases_time.txt | 40 +++++++++---------- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt index 636d0e8ad..37052db4a 100644 --- a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt @@ -5,7 +5,7 @@ 九百~900 九百五十~950 九百五十一~951 -一千~1千 +一千~1000 一千零一~1,001 一千一百~1,100 一千一百零一~1,101 diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt index fec09a461..a73dc302e 100644 --- a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_decimal.txt @@ -3,14 +3,14 @@ 一百点零~100.0 一百零一点五~101.5 一点零五六~1.056 -一点零零五六~1.005,6 -一点零零零五六~1.000,56 +一点零零五六~1.0056 +一点零零零五六~1.00056 两百点一~200.1 三千点五~3,000.5 四万点六~40,000.6 一點零零五~1.005 九十九點零零零五~99.0005 -一百點五七三五~100.573,5 +一百點五七三五~100.5735 一千五百点零一~1,500.01 负五万点二四五~-50,000.245 负十五万点三七九~-150,000.379 @@ -23,12 +23,12 @@ 負五萬點三~-50,000.3 負五點零一~-5.01 負十點零零一~-10.001 
-負十點零零零三~-10.000,3 -負一百點零零零零四~-100.000,04 -一点一二三四五六七八九~1.123,456,789 +負十點零零零三~-10.0003 +負一百點零零零零四~-100.00004 +一点一二三四五六七八九~1.123456789 负五点一零二~-5.102 -负三点一二零三~-3.120,3 -负十点一二三零五~-10.123,05 +负三点一二零三~-3.1203 +负十点一二三零五~-10.12305 伍拾壹点肆~51.4 壹佰点叁肆~100.34 贰拾点伍陆~20.56 diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt index 928f83063..922a2ea0e 100644 --- a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt @@ -1,22 +1,20 @@ 五点五分~05:05 -五点半~05:30 -五点一刻~05:15 -两点二刻~02:30 -三点三刻~03:45 -六点~06:00 -上午五点五分~05:05 a.m. -上午五点半~05:30 a.m. -上午五点一刻~05:15 a.m. -上午两点二刻~02:30 a.m. -上午三点三刻~03:45 a.m. -下午五点五分~05:05 p.m. -下午五点半~05:30 p.m. -下午两点一刻~02:15 p.m. -下午三点二刻~03:30 p.m. -下午四点~04:00 p.m. -正午~12:00 -半夜~00:00 -三点差五分~02:55 -两点差三分~01:57 -三点差四分~02:56 -四点差五分~03:55 \ No newline at end of file +五点一刻~5点1刻 +两点二刻~2点2刻 +三点三刻~3点3刻 +六点~6点 +五点五分~05:05 +五点半~5点半 +五点一刻~5点1刻 +两点三刻~2点3刻 +三点三刻~3点3刻 +五点五分~05:05 +两点一刻~2点1刻 +三点二刻~3点2刻 +四点~4点 +一点五分十秒~01:05:10 +十三点五分十秒~13:05:10 +十点~10点 +五分钟~5分钟 +五秒钟~5秒钟 +十三点五分~13:05 \ No newline at end of file From 7c9866d3beee6820fe644312e90d4a51805ee97b Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:40:10 -0700 Subject: [PATCH 34/89] updates according to last PR Signed-off-by: BuyuanCui --- .../zh/taggers/money.py | 98 +++++++------------ 1 file changed, 36 insertions(+), 62 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py index 900865c4c..c7eaa18a6 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py @@ -23,12 +23,12 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): 
super().__init__(name="money", kind="classify") # imports - minor_currency_cent = pynini.string_file(get_abs_path("data/money/currency_rmb_minor_cent-nano.tsv")) - minor_currency_tencent = pynini.string_file(get_abs_path("data/money/currency_rmb_minor_tencent-nano.tsv")) + minor_currency_cent = pynini.string_file(get_abs_path("data/money/currency_rmb_minor_cent.tsv")) + minor_currency_tencent = pynini.string_file(get_abs_path("data/money/currency_rmb_minor_tencent.tsv")) minor_digit = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) - zero = pynini.string_file(get_abs_path("data/numbers/zero-nano.tsv")) - major_currency = pynini.string_file(get_abs_path("data/money/currency_major-nano.tsv")) # - minor_currency = pynini.string_file(get_abs_path("data/money/currency_minor-nano.tsv")) # + zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + major_currency = pynini.string_file(get_abs_path("data/money/currency_major.tsv")) # + minor_currency = pynini.string_file(get_abs_path("data/money/currency_minor.tsv")) # graph_cardinal = cardinal.for_ordinals graph_decimal = decimal.final_graph_wo_negative # fraction_integer = minor_digit | zero @@ -38,24 +38,21 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph_fractional_values = graph_cardinal @ add_leading_zero_to_double_digit # # regular number and yuan part - graph_integer_component = pynutil.insert("integer_part: \"") + graph_cardinal + pynutil.insert("\"") + graph_integer_component = pynutil.insert('integer_part: "') + graph_cardinal + pynutil.insert('"') graph_fractional_component = ( - pynutil.insert("fractional_part: \"") - + graph_fractional_values - + pynutil.insert("\"") - + pynutil.delete(minor_currency) - ) - graph_fractional_component_ex = ( - pynutil.insert("fractional_part: \"") + graph_fractional_values + pynutil.insert("\"") + pynutil.insert('fractional_part: "') + graph_fractional_values + pynutil.insert('"') ) + # graph_fractional_component_ex = ( + # 
pynutil.insert("fractional_part: \"") + graph_fractional_values + pynutil.insert("\"") + # ) # regular symbol part - graph_major_currency = pynutil.insert("currency: \"") + major_currency + pynutil.insert("\"") - graph_minor_currency = pynutil.insert("currency: \"") + minor_currency + pynutil.insert("\"") + graph_major_currency = pynutil.insert('currency: "') + major_currency + pynutil.insert('"') + graph_minor_currency = pynutil.insert('currency: "') + minor_currency + pynutil.insert('"') # regular combine number and symbol part graph_only_major = graph_integer_component + pynutil.insert(" ") + graph_major_currency - graph_only_minor = graph_fractional_component_ex + pynutil.insert(" ") + graph_minor_currency + graph_only_minor = graph_fractional_component + pynutil.insert(" ") + graph_minor_currency graph_money = graph_only_major + pynutil.insert(" ") + graph_fractional_component # regular large money with decimals @@ -64,63 +61,40 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): # final graph for regular currency graph_regular_money = graph_only_major | graph_only_minor | graph_money | graph_large_money - # yuan number part - graph_cent_fractional_comp = pynutil.insert("cent_part: \"") + fraction_integer + pynutil.insert("\"") - graph_tencent_fractional_comp = pynutil.insert("tencent_part: \"") + fraction_integer + pynutil.insert("\"") - - # yuan symbol part - graph_currency_minor_cent = pynutil.insert("currency: \"") + minor_currency_cent + pynutil.insert("\"") - graph_currency_minor_tencent = pynutil.insert("currency: \"") + minor_currency_tencent + pynutil.insert("\"") - - # yuan combine number and symbol part - graph_only_cent = graph_cent_fractional_comp + pynutil.insert(" ") + graph_currency_minor_cent - graph_only_tencent = graph_tencent_fractional_comp + pynutil.insert(" ") + graph_currency_minor_tencent - # yuan major plus minor - symbols = pynini.union('元', '毛', '角', '分') - delete_symbols = pynutil.delete(symbols) - graph_major_cent = ( + 
major_symbol = pynini.accep("块") | pynini.cross("塊", "块") + tencent = pynini.accep("毛") | pynini.accep("角",) + cent = pynini.accep("分") + graph_kuai = ( graph_integer_component - + delete_symbols + pynutil.insert(" ") - + graph_cent_fractional_comp - + pynutil.insert(" ") - + graph_currency_minor_cent + + pynutil.insert('currency_major: "') + + pynini.closure(major_symbol, 1, 1) + + pynutil.insert('"') ) - graph_major_tencent = ( + graph_mao = ( graph_integer_component - + delete_symbols - + pynutil.insert(" ") - + graph_tencent_fractional_comp + pynutil.insert(" ") - + graph_currency_minor_tencent + + pynutil.insert('currency: "') + + pynini.closure(tencent, 1, 1) + + pynutil.insert('"') ) - graph_tencent_cent = ( - graph_tencent_fractional_comp - + delete_symbols - + pynutil.insert(" ") - + graph_cent_fractional_comp - + pynutil.insert(" ") - + graph_currency_minor_cent - ) - graph_major_minor = ( + graph_fen = ( graph_integer_component - + delete_symbols - + pynutil.insert(" ") - + graph_tencent_fractional_comp + pynutil.insert(" ") - + delete_symbols - + graph_cent_fractional_comp - + pynutil.insert(" ") - + graph_currency_minor_cent + + pynutil.insert('currency_minor: "') + + pynini.closure(cent, 1, 1) + + pynutil.insert('"') + ) + graph_kuaimao = graph_kuai + pynutil.insert(" ") + graph_mao + graph_kuaifen = graph_kuai + pynutil.insert(" ") + graph_fen + graph_maofen = graph_mao + pynutil.insert(" ") + graph_fen + graph_kuaimaofen = graph_kuai + pynutil.insert(" ") + graph_mao + pynutil.insert(" ") + graph_fen + graph_mandarin = ( + graph_kuai | graph_mao | graph_fen | graph_kuaimao | graph_kuaifen | graph_maofen | graph_kuaimaofen ) - - # final graph for yuan - graph_yuan_only = graph_only_cent | graph_only_tencent - graph_yuan_comb = graph_major_cent | graph_major_tencent | graph_tencent_cent | graph_major_minor # combing both - graph_yuan = graph_yuan_only | graph_yuan_comb - graph_final = graph_regular_money | graph_yuan + graph_final = 
graph_regular_money | graph_mandarin final = self.add_tokens(graph_final) self.fst = final.optimize() From 7a5e8dfd6e0a5a2f8ad0cce75cfca204e3e882a0 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:40:38 -0700 Subject: [PATCH 35/89] updates according to the updates for cardinal grammar Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/ordinal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py index 2b044d5b7..b1eacaa09 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py @@ -25,6 +25,6 @@ def __init__(self, cardinal: GraphFst): graph_cardinals = cardinal.for_ordinals mandarin_morpheme = pynini.accep("第") graph_ordinal = mandarin_morpheme + graph_cardinals - graph_ordinal_final = pynutil.insert("integer: \"") + graph_ordinal + pynutil.insert("\"") + graph_ordinal_final = pynutil.insert('integer: "') + graph_ordinal + pynutil.insert('"') graph_ordinal_final = self.add_tokens(graph_ordinal_final) self.fst = graph_ordinal_final.optimize() From d4f958537386d7d05a8d72687ded73d9fc0af8fd Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:41:10 -0700 Subject: [PATCH 36/89] updates for more Mandarin punctuations Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/punctuation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py index 74c098ed9..4ca8eab9b 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/punctuation.py @@ -26,9 +26,9 @@ class PunctuationFst(GraphFst): def 
__init__(self): super().__init__(name="punctuation", kind="classify") - s = "!#$%&\'()*+,-./:;<=>?@^_`{|}~。,;:《》“”·~【】!?、‘’.<>-——_" + s = "!#$%&'()*+,-./:;<=>?@^_`{|}~。,;:《》“”·~【】!?、‘’.<>-——_" punct = pynini.union(*s) - graph = pynutil.insert("name: \"") + punct + pynutil.insert("\"") + graph = pynutil.insert('name: "') + punct + pynutil.insert('"') self.fst = graph.optimize() From d4d15555089b1fcce635e3f3a62bac65abc43474 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:41:41 -0700 Subject: [PATCH 37/89] updated accordingly to last PR. removing am pm Signed-off-by: BuyuanCui --- .../zh/taggers/time.py | 176 +++++++----------- 1 file changed, 65 insertions(+), 111 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py index c6fd4e436..120ba485b 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py @@ -22,130 +22,84 @@ class TimeFst(GraphFst): """ Finite state transducer for classifying time - e.g., 五d点 -> time { hours: "5" minutes: "00" } - e.g., 正午 -> time { hours: "12" minutes: "00" } - e.g., 两点一刻 -> time { hours: "2" minutes: "15" } - e.g., 上午九点 -> time { hours: "2" minutes: "00" affix: "a.m." 
} - e.g., 五点差五分 -> time { hours: "4" minutes: "55"} + e.g., 五点十分 -> time { hours: "05" minutes: "10" } + e.g., 五时十五分 -> time { hours: "05" minutes: "15" } + e.g., 十五点十分 -> time { hours: "15" minutes: "10" } + e.g., 十五点十分二十秒 -> time { hours: "15" minutes: "10" seconds: "20 } + e.g., 两点一刻 -> time { hours: "2" minutes: "1刻" } + e.g., 五点 -> time { hours: "5点" } + e.g., 五小时 -> time { hours: "5小时" } + e.g., 五分 -> time { minutess: "5分" } + e.g., 五分钟 -> time { seconds: "5分钟" } + e.g., 五秒 -> time { seconds: "5秒" } + e.g., 五秒钟 -> time { seconds: "5秒钟" } """ def __init__(self): super().__init__(name="time", kind="classify") - # data imported - hours = pynini.string_file(get_abs_path("data/time/time_hours-nano.tsv")) # hours from 1 to 24 - minutes = pynini.string_file(get_abs_path("data/time/time_minutes-nano.tsv")) # minutes from 1 to 60 - hours_to = pynini.string_file(get_abs_path("data/time/hours_to-nano.tsv")) # used for back counting, see below - minutes_to = pynini.string_file( - get_abs_path("data/time/minutes_to-nano.tsv") - ) # used for back counting, see below + hours = pynini.string_file(get_abs_path("data/time/time_hours.tsv")) + minutes = pynini.string_file(get_abs_path("data/time/time_minutes.tsv")) + seconds = pynini.string_file(get_abs_path("data/time/time_seconds.tsv")) + quarters = pynini.string_file(get_abs_path("data/time/time_quarters.tsv")) + for_mandarin = pynini.string_file(get_abs_path("data/time/time_mandarin.tsv")) - # graph for one quarter (e.g., 10:15) - graph_quarter = pynini.accep("一刻") | pynini.accep("壹刻") | pynini.accep("幺刻") - graph_quarter = pynini.cross(graph_quarter, "15") - - # grammar for two quarters or half (e.g., 10:30) - graph_half = pynini.accep("半").ques - graph_half = pynini.cross(graph_half, "30") - graph_half_alt = pynini.accep("二刻") | pynini.accep("貳刻") | pynini.accep("两刻") | pynini.accep("兩刻") - graph_half_alt = pynini.cross(graph_half_alt, "30") - graph_half = graph_half | graph_half_alt - - # grammar for three quarters (e.g., 
10:45) - graph_three_quarter = pynini.accep("三刻", "45") | pynini.accep("叁刻", "45") - graph_three_quarter = pynini.cross(graph_three_quarter, "45") - - # combining grammars quarter, two quater, and three quarter - graph_fractions = graph_quarter | graph_half | graph_three_quarter - - # graph for "Noon-12PM" - graph_noon = pynini.cross("中午", "12") | pynini.cross("正午", "12") | pynini.cross("午间", "12") - graph_midnight = pynini.cross("午夜", "0") | pynini.cross("半夜", "0") | pynini.cross("子夜", "0") - - # graph for hour - graph_delete_hours = ( - pynutil.delete("点") | pynutil.delete("點") | pynutil.delete("时") | pynutil.delete("時") - ) # "点": Mandarin for "hour | o'clock" (e.g.,十点=ten o' clock) + graph_delete_hours = pynutil.delete("点") | pynutil.delete("點") | pynutil.delete("时") | pynutil.delete("時") graph_hours = hours + graph_delete_hours + graph_hours_component = pynutil.insert('hours: "') + graph_hours + pynutil.insert('"') - # graph for minutes - graph_minutes = pynutil.delete('分') + graph_minutes = pynutil.delete("分") graph_minutes = minutes + graph_minutes - - # add tokenization for hours position component - graph_hours_component = pynini.union(graph_hours, graph_noon, graph_midnight) # what to put at hours-position - graph_hours_component = pynutil.insert("hours: \"") + graph_hours_component + pynutil.insert("\"") - - # add tokenization for minutes position component - graph_minutes_component = pynini.union(graph_minutes, graph_fractions) | pynutil.insert( - "00" - ) # what to put at minutes-position - graph_minutes_component = pynutil.insert(" minutes: \"") + graph_minutes_component + pynutil.insert("\"") - graph_minutes_component = delete_space + graph_minutes_component - - # combine two above to process digit + "hours" + digit " minutes/null" (e.g., 十点五十分/十点五十-> hours: "10" minutes: "50") - graph_time_standard = graph_hours_component + graph_minutes_component - - # combined hours and minutes but with prefix - graph_time_standard_affix = ( - ( - 
(pynutil.delete("上午") | pynutil.delete("早上")) - + graph_time_standard - + pynutil.insert(" affix: \"") - + pynutil.insert("a.m.") - + pynutil.insert("\"") - ) - ) | ( - ( - (pynutil.delete("下午") | pynutil.delete("晚上")) - + graph_time_standard - + pynutil.insert(" affix: \"") - + pynutil.insert("p.m.") - + pynutil.insert("\"") - ) + graph_minutes_component = pynutil.insert('minutes: "') + graph_minutes + pynutil.insert('"') + + graph_seconds = pynutil.delete("秒") + graph_seconds = seconds + graph_seconds + graph_seconds_component = pynutil.insert('seconds: "') + graph_seconds + pynutil.insert('"') + + graph_time_standard = (graph_hours_component + pynutil.insert(" ") + graph_minutes_component) | ( + graph_hours_component + + pynutil.insert(" ") + + graph_minutes_component + + pynutil.insert(" ") + + graph_seconds_component ) - # combined hours and minutes (上午十點五十-> hours: "10" minutes: "50" affix: "a.m.") - graph_time_standard = graph_time_standard | graph_time_standard_affix - - # grammar for back-counting - # converting hours back - graph_hours_to_component = graph_hours | graph_noon | graph_midnight # | graph_hours_count - graph_hours_to_component @= hours_to # hours_to is the string_file data - graph_hours_to_component = pynutil.insert("hours: \"") + graph_hours_to_component + pynutil.insert("\"") - - # converting minutes back - graph_minutes_to_component = minutes | graph_half | graph_quarter | graph_three_quarter | graph_half_alt - graph_minutes_to_component @= minutes_to # minutes_to is the string_file data - graph_minutes_to_component = pynutil.insert(" minutes: \"") + graph_minutes_to_component + pynutil.insert("\"") - - graph_delete_back_counting = pynutil.delete("差") | pynutil.delete("还有") | pynutil.delete("還有") - graph_delete_minutes = pynutil.delete("分") | pynutil.delete("分钟") | pynutil.delete("分鐘") - - # adding a.m. and p.m. 
- graph_time_to = ( - graph_hours_to_component + graph_delete_back_counting + graph_minutes_to_component + graph_delete_minutes + quarter_mandarin = ( + quarters + pynini.accep("刻") | pynini.cross("刻鈡", "刻钟") | pynini.accep("刻钟") | pynini.accep("半") + ) + hour_mandarin = ( + pynini.accep("点") + | pynini.accep("时") + | pynini.cross("點", "点") + | pynini.cross("時", "时") + | pynini.accep("小时") + | pynini.cross("小時", "小时") + | pynini.cross("個點", "个点") + | pynini.accep("个点") + | pynini.accep("个钟头") + | pynini.cross("個鐘頭", "个钟头") + | pynini.accep("个小时") + | pynini.cross("個小時", "个小时") ) - graph_time_to_affix = ( - ( - (pynutil.delete("上午") | pynutil.delete("早上")) - + graph_time_to - + pynutil.insert(" affix: \"") - + pynutil.insert("a.m.") - + pynutil.insert("\"") - ) - ) | ( - ( - (pynutil.delete("下午") | pynutil.delete("晚上")) - + graph_time_to - + pynutil.insert(" prefix: \"") - + pynutil.insert("p.m.") - + pynutil.insert("\"") - ) + minute_mandarin = pynini.accep("分") | pynini.cross("分鐘", "分钟") | pynini.accep("分钟") + second_mandarin = pynini.accep("秒") | pynini.cross("秒鐘", "秒钟") + + hours_only = for_mandarin + hour_mandarin + minutes_only = for_mandarin + minute_mandarin + seconds_only = for_mandarin + second_mandarin + + graph_mandarin_hour = pynutil.insert('hours: "') + hours_only + pynutil.insert('"') + graph_mandarin_minute = pynutil.insert('minutes: "') + minutes_only + pynutil.insert('"') + graph_mandarin_second = pynutil.insert('seconds: "') + seconds_only + pynutil.insert('"') + graph_mandarin_quarter = pynutil.insert('minutes: "') + quarter_mandarin + pynutil.insert('"') + graph_mandarins = ( + graph_mandarin_hour + | graph_mandarin_minute + | graph_mandarin_second + | graph_mandarin_quarter + | (graph_mandarin_hour + pynutil.insert(" ") + graph_mandarin_quarter) ) - graph_time_to = graph_time_to | graph_time_to_affix - # final grammar - final_graph = graph_time_standard | graph_time_to + final_graph = graph_time_standard | graph_mandarins final_graph = 
self.add_tokens(final_graph) self.fst = final_graph.optimize() From c25badaf0c100b8d82e28d6904a6fe582fbafe91 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:42:03 -0700 Subject: [PATCH 38/89] adjustment on the weight Signed-off-by: BuyuanCui --- .../zh/taggers/tokenize_and_classify.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py index 6cf3ab137..70018a8c1 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py @@ -17,6 +17,7 @@ import pynini from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( + INPUT_LOWER_CASED, GraphFst, delete_extra_space, delete_space, @@ -36,8 +37,7 @@ # from nemo_text_processing.inverse_text_normalization.zh.taggers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.zh.taggers.time import TimeFst - -# from nemo_text_processing.inverse_text_normalization.zh.taggers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.zh.taggers.whitelist import WhiteListFst from nemo_text_processing.inverse_text_normalization.zh.taggers.word import WordFst from pynini.lib import pynutil @@ -53,7 +53,13 @@ class ClassifyFst(GraphFst): overwrite_cache: set to True to overwrite .far files """ - def __init__(self, cache_dir: str = None, whitelist: str = None, overwrite_cache: bool = False): + def __init__( + self, + input_case: INPUT_LOWER_CASED, + cache_dir: str = None, + whitelist: str = None, + overwrite_cache: bool = False, + ): super().__init__(name="tokenize_and_classify", kind="classify") far_file = None @@ -81,16 +87,18 @@ def __init__(self, cache_dir: str = None, whitelist: str = None, overwrite_cache fraction = 
FractionFst(cardinal) fraction_graph = fraction.fst punct_graph = PunctuationFst().fst + whitelist_graph = WhiteListFst(input_file=whitelist, input_case=input_case).fst classify = ( pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(date_graph, 1.09) | pynutil.add_weight(decimal_graph, 1.2) - | pynutil.add_weight(cardinal_graph, 1.1) + | pynutil.add_weight(cardinal_graph, 1.09) | pynutil.add_weight(ordinal_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) | pynutil.add_weight(fraction_graph, 1.1) | pynutil.add_weight(word_graph, 100) + | pynutil.add_weight(whitelist_graph, 1.01) ) punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") From b5c8497fbc4369c505b94e6334267838ccde9d5d Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:42:30 -0700 Subject: [PATCH 39/89] updated accordingly to the targger updates Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/verbalizers/cardinal.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py index 85f696fc7..3eec1a88b 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py @@ -72,9 +72,9 @@ def __init__(self): optional_sign = pynini.closure( pynutil.delete("negative: ") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.accep("-") - + pynutil.delete("\"") + + pynutil.delete('"') + delete_space ) @@ -82,10 +82,10 @@ def __init__(self): graph = ( pynutil.delete("integer:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_DIGIT, 0, 1) + pynini.closure(NEMO_SIGMA) - + pynutil.delete("\"") + + pynutil.delete('"') ) graph = graph @ group_by_threes From 2113a7dac56765295a109526df5be32725e028be Mon Sep 17 
00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:42:56 -0700 Subject: [PATCH 40/89] updated accordingly to the time tagger Signed-off-by: BuyuanCui --- .../zh/verbalizers/date.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py index dc44e45e8..2b979e6b8 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/date.py @@ -32,30 +32,30 @@ def __init__(self): year = ( pynutil.delete("year:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') ) month = ( pynutil.delete("month:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') ) day = ( pynutil.delete("day:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') ) era = pynutil.delete("era:") - bc = era + delete_space + pynutil.delete("\"") + pynini.cross("A.D.", "公元") + pynutil.delete("\"") - ad = era + delete_space + pynutil.delete("\"") + pynini.cross("B.C.", "公元前") + pynutil.delete("\"") + bc = era + delete_space + pynutil.delete('"') + pynini.cross("A.D.", "公元") + pynutil.delete('"') + ad = era + delete_space + pynutil.delete('"') + pynini.cross("B.C.", "公元前") + pynutil.delete('"') # combining above 3 for variations graph_ymd = ( From 785cbb753fca3ff9839d5a2ffa1592cf531bdded Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:43:27 -0700 Subject: [PATCH 41/89] updates according to changes in tagger on am and pm Signed-off-by: BuyuanCui --- .../zh/verbalizers/decimal.py | 17 ++++++++--------- 
1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py index f37d495eb..ea8fa4ab0 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py @@ -45,9 +45,9 @@ def __init__(self): optional_sign = pynini.closure( pynutil.delete("negative: ") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.accep("-") - + pynutil.delete("\"") + + pynutil.delete('"') + delete_space ) @@ -55,9 +55,9 @@ def __init__(self): integer = ( pynutil.delete("integer_part:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") + + pynutil.delete('"') ) integer = integer @ group_by_threes optional_integer = pynini.closure(integer + delete_space, 0, 1) @@ -67,20 +67,19 @@ def __init__(self): pynutil.insert(".") + pynutil.delete("fractional_part:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") + + pynutil.delete('"') ) - fractional = fractional @ group_by_threes optional_fractional = pynini.closure(fractional + delete_space, 0, 1) # removing tokenization, 'quantity:' quantity = ( pynutil.delete("quantity:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") + + pynutil.delete('"') ) optional_quantity = pynini.closure(quantity + delete_space) From ceae274493b30095baf86ab57519fd0d9130778b Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:44:03 -0700 Subject: [PATCH 42/89] verbalizer for fraction Signed-off-by: BuyuanCui --- .../zh/verbalizers/fraction.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py index 4eaab1aa1..d5ea2ced1 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/fraction.py @@ -30,25 +30,25 @@ def __init__(self): integer_part = ( pynutil.delete("integer_part:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_DIGIT) + pynutil.insert("又") - + pynutil.delete("\"") + + pynutil.delete('"') ) denominator_part = ( pynutil.delete("denominator:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_DIGIT) - + pynutil.delete("\"") + + pynutil.delete('"') ) numerator_part = ( pynutil.delete("numerator:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_DIGIT) + pynutil.insert("/") - + pynutil.delete("\"") + + pynutil.delete('"') ) graph_with_integer = integer_part + delete_space + numerator_part + delete_space + denominator_part From aeae3794eddc875e7ece3b5b0565ab11a4af914d Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:44:31 -0700 Subject: [PATCH 43/89] added for mandarin grammar Signed-off-by: BuyuanCui --- .../zh/taggers/whitelist.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py new file mode 100644 index 000000000..118f232ac --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py @@ -0,0 +1,47 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os + +import pynini +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( + INPUT_CASED, + INPUT_LOWER_CASED, + GraphFst, + convert_space, + string_map_cased, +) +from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path +from pynini.lib import pynutil + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for classifying whitelisted tokens + e.g. 贵宾 -> tokens { name: "VIP" } + 美国研究生入学考试 -> { name: "GRE" } + 人力资源 -> { name: "HR" } + 工商管理学硕士 -> { name: "MBA" } + This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv". 
+ """ + + def __init__(self, input_case: str = INPUT_LOWER_CASED, input_file: str = None): + super().__init__(name="whitelist", kind="classify") + + whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")) + graph = (pynutil.insert('name: "')) + (whitelist) + pynutil.insert('"') + + self.fst = graph.optimize() From 5852b417d7f026522833631d98e369c632d2660a Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 14:45:57 -0700 Subject: [PATCH 44/89] kept this file because using English utils results in data namin error Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/utils.py | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 nemo_text_processing/inverse_text_normalization/zh/utils.py diff --git a/nemo_text_processing/inverse_text_normalization/zh/utils.py b/nemo_text_processing/inverse_text_normalization/zh/utils.py new file mode 100644 index 000000000..d63a1b2f7 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/utils.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from typing import List, Union + +import inflect + +_inflect = inflect.engine() + + +def num_to_word(x: Union[str, int]): + """ + converts integer to spoken representation + + Args + x: integer + + Returns: spoken representation + """ + if isinstance(x, int): + x = str(x) + x = _inflect.number_to_words(str(x)).replace("-", " ").replace(",", "") + return x + + +def get_abs_path(rel_path): + """ + Get absolute path + + Args: + rel_path: relative path to this file + + Returns absolute path + """ + return os.path.dirname(os.path.abspath(__file__)) + "/" + rel_path + + +def get_various_formats(text: str) -> List[str]: + """ + Return various formats for text, e.g., all caps, the first letter upper cased, space separated, etc. + """ + result = [] + if len(text) == 0: + return [] + + for t in [text, " ".join(list(text))]: + result.append(t) + result.append(t.upper()) + result.append(t.capitalize()) + return result From d018a0c9ad93fc98de8c4d1390b225cb26345fe9 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 25 May 2023 15:29:06 -0700 Subject: [PATCH 45/89] merge conflict Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/graph_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index 8ec83f113..798ffc189 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -55,7 +55,7 @@ INPUT_LOWER_CASED = "lower_cased" -def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]): +def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): """ Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. 
From c72c7cb1035f8157e69bbfc45d88f1a0e0a4c50b Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Mon, 29 May 2023 10:07:57 -0700 Subject: [PATCH 46/89] removed unsed imports Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/time.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py index 120ba485b..240ac03ee 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py @@ -14,7 +14,7 @@ import pynini -from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path from pynini.lib import pynutil From 5a363e2ccfc527d51166609c63b2e341d0aef376 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Mon, 29 May 2023 10:09:48 -0700 Subject: [PATCH 47/89] deleted unsed import os Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/whitelist.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py index 118f232ac..d668f2d77 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py @@ -14,8 +14,6 @@ # limitations under the License. 
-import os - import pynini from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( INPUT_CASED, From 8a8b1df6d6877036b8f599fdd7a03b785e7d952a Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Mon, 29 May 2023 10:12:53 -0700 Subject: [PATCH 48/89] deleted unsed variables Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/money.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py index c7eaa18a6..638272683 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py @@ -23,15 +23,10 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="money", kind="classify") # imports - minor_currency_cent = pynini.string_file(get_abs_path("data/money/currency_rmb_minor_cent.tsv")) - minor_currency_tencent = pynini.string_file(get_abs_path("data/money/currency_rmb_minor_tencent.tsv")) - minor_digit = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) - zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) major_currency = pynini.string_file(get_abs_path("data/money/currency_major.tsv")) # minor_currency = pynini.string_file(get_abs_path("data/money/currency_minor.tsv")) # graph_cardinal = cardinal.for_ordinals graph_decimal = decimal.final_graph_wo_negative # - fraction_integer = minor_digit | zero # add leding zero to the number: 1 -> 01 add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT) # @@ -42,9 +37,6 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph_fractional_component = ( pynutil.insert('fractional_part: "') + graph_fractional_values + pynutil.insert('"') ) - # graph_fractional_component_ex = ( - # pynutil.insert("fractional_part: \"") + graph_fractional_values + pynutil.insert("\"") - 
# ) # regular symbol part graph_major_currency = pynutil.insert('currency: "') + major_currency + pynutil.insert('"') From 434f041895a4e1c6ca9b785f3e575d7784d81c24 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Mon, 29 May 2023 10:13:15 -0700 Subject: [PATCH 49/89] removed unsed imports Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/whitelist.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py index d668f2d77..5c892b220 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py @@ -16,11 +16,8 @@ import pynini from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( - INPUT_CASED, INPUT_LOWER_CASED, GraphFst, - convert_space, - string_map_cased, ) from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path from pynini.lib import pynutil From 5278e98fb487188d25c331c2d0694640f0c93f7f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 May 2023 17:15:33 +0000 Subject: [PATCH 50/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/zh/taggers/whitelist.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py index 5c892b220..8e0cbd328 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/whitelist.py @@ -15,10 +15,7 @@ import pynini -from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( - INPUT_LOWER_CASED, - GraphFst, -) +from 
nemo_text_processing.inverse_text_normalization.zh.graph_utils import INPUT_LOWER_CASED, GraphFst from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path from pynini.lib import pynutil From 40b6bc9c4a461e519106198acad843cddef80ac9 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Wed, 31 May 2023 13:33:59 -0700 Subject: [PATCH 51/89] updates and edits based on pr checks Signed-off-by: BuyuanCui --- .../zh/taggers/money.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py index 638272683..a85b55200 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py @@ -23,8 +23,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="money", kind="classify") # imports - major_currency = pynini.string_file(get_abs_path("data/money/currency_major.tsv")) # - minor_currency = pynini.string_file(get_abs_path("data/money/currency_minor.tsv")) # + major_currency = pynini.string_file(get_abs_path("data/money/currency_major.tsv")) + minor_currency = pynini.string_file(get_abs_path("data/money/currency_minor.tsv")) + digits = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) graph_cardinal = cardinal.for_ordinals graph_decimal = decimal.final_graph_wo_negative # @@ -67,7 +68,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph_mao = ( graph_integer_component + pynutil.insert(" ") - + pynutil.insert('currency: "') + + pynutil.insert('currency_minor: "') + pynini.closure(tencent, 1, 1) + pynutil.insert('"') ) @@ -78,10 +79,16 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + pynini.closure(cent, 1, 1) + pynutil.insert('"') ) - graph_kuaimao = graph_kuai + pynutil.insert(" ") + graph_mao - graph_kuaifen = graph_kuai + 
pynutil.insert(" ") + graph_fen - graph_maofen = graph_mao + pynutil.insert(" ") + graph_fen - graph_kuaimaofen = graph_kuai + pynutil.insert(" ") + graph_mao + pynutil.insert(" ") + graph_fen + + ### + #graph_kuaimao = graph_kuai + pynutil.insert(" ") + graph_mao + graph_digits = pynutil.insert('fractional_part: "') + digits + pynutil.insert('"') + graph_kuaimao = graph_kuai + pynutil.insert(" ") + graph_digits + pynutil.insert(" ") + pynutil.insert('currency_minor: "') + pynini.closure(tencent, 1, 1) + pynutil.insert('"') + graph_kuaifen = graph_kuai + pynutil.insert(" ") + graph_digits + pynutil.insert(" ") + pynutil.insert('currency_minor: "') + pynini.closure(cent, 1, 1) + pynutil.insert('"') + graph_maofen = pynutil.insert('fractional_part: "') + digits + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('currency_minor: "') + pynini.closure(tencent, 1, 1) + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('fraction_part: "') + digits + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('currency_min: "') + pynini.closure(cent, 1, 1) + pynutil.insert('"') + + graph_kuaimaofen = graph_kuai + pynutil.insert(" ") + pynutil.insert('fractional_part: "') + digits + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('currency_minor: "') + pynini.closure(tencent, 1, 1) + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('fraction_part: "') + digits + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('currency_min: "') + pynini.closure(cent, 1, 1) + pynutil.insert('"') + graph_mandarin = ( graph_kuai | graph_mao | graph_fen | graph_kuaimao | graph_kuaifen | graph_maofen | graph_kuaimaofen ) From 03fb6f06fad8c98198de2b96cfbd3db77cb37984 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Wed, 31 May 2023 13:34:18 -0700 Subject: [PATCH 52/89] updates and edits based on pr checks Signed-off-by: BuyuanCui --- .../zh/verbalizers/money.py | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git 
a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py index 9b1e8b637..c3522daa2 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py @@ -13,7 +13,7 @@ # limitations under the License. import pynini -from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, NEMO_DIGIT from pynini.lib import pynutil @@ -48,17 +48,25 @@ def __init__(self): major_symbol = pynini.accep("块") minor_symbol = pynini.accep("毛") | pynini.accep("角") lesser_symbol = pynini.accep("分") + major_currency = pynutil.delete('currency_major: "') + major_symbol + pynutil.delete('"') - minor_currency = pynutil.delete('currency: "') + minor_symbol + pynutil.delete('"') - lesser_currency = pynutil.delete('currency_minor: "') + lesser_symbol + pynutil.delete('"') + minor_currency = pynutil.delete('currency_minor: "') + minor_symbol + pynutil.delete('"') + lesser_currency = pynutil.delete('currency_min:"') + lesser_symbol + pynutil.delete('"') graph_kuai = number_unit + delete_space + major_currency - graph_mao = ( - number_unit + delete_space + major_currency + delete_space + number_unit + delete_space + minor_currency - ) - # | (number_unit + delete_space + major_currency + delete_space + number_unit + delete_space + minor_currency + delete_space + number_unit + delete_space + lesser_currency) + graph_mao = number_unit + delete_space + minor_currency + graph_mao = number_unit + delete_space + minor_currency + graph_fen = number_unit + delete_space + lesser_currency + + graph_kuaimao = graph_kuai + delete_space + fraction_unit + delete_space + minor_currency + graph_kuaifen = graph_kuai + delete_space + fraction_unit + delete_space + 
lesser_currency + graph_maofen = pynutil.delete('fractional_part: "') + pynini.closure(NEMO_DIGIT, 1) + pynutil.delete('"') + delete_space + pynutil.delete('currency_minor: "') + minor_symbol + pynutil.delete('"') + delete_space + pynutil.delete('fraction_part: "') + pynini.closure(NEMO_DIGIT, 1) + pynutil.delete('"') + delete_space + pynutil.delete('currency_min: "') + lesser_symbol + pynutil.delete('"') + + graph_all = graph_kuai + delete_space + graph_maofen + + graph_mandarin = (graph_kuai | graph_mao | graph_fen) | graph_kuaimao | graph_kuaifen | graph_maofen | graph_all - graph_verbalizer = graph_regular | pynutil.add_weight(graph_mao, -2.0) + graph_verbalizer = graph_regular | pynutil.add_weight(graph_mandarin, -2.0) delete_tokens = self.delete_tokens(graph_verbalizer) self.fst = delete_tokens.optimize() From 91fa0d407330dbc5d804568ffbf15b1d18d46cf5 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Wed, 31 May 2023 13:45:26 -0700 Subject: [PATCH 53/89] format issue, reccreated Signed-off-by: BuyuanCui --- .../zh/data/date/day.tsv | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/date/day.tsv diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/date/day.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/date/day.tsv new file mode 100644 index 000000000..fd3e3ddab --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/date/day.tsv @@ -0,0 +1,74 @@ +一 1 +二 2 +三 3 +四 4 +五 5 +六 6 +七 7 +八 8 +九 9 +十 10 +十一 11 +十二 12 +十三 13 +十四 14 +十五 15 +十六 16 +十七 17 +十八 18 +十九 19 +二十 20 +二十一 21 +二十二 22 +二十三 23 +二十四 24 +二十五 25 +二十六 26 +二十七 27 +二十八 28 +二十九 29 +三十 30 +三十一 31 +壹 1 +貳 2 +參 3 +肆 4 +伍 5 +陸 6 +柒 7 +捌 8 +玖 9 +幺 1 +两 2 +兩 2 +拾 10 +拾壹 11 +拾貳 12 +拾叁 13 +拾肆 14 +拾伍 15 +拾陸 16 +拾柒 17 +拾捌 18 +拾玖 19 +貳拾 20 +貳拾壹 21 +貳拾貳 22 +貳拾叁 23 +貳拾肆 24 +貳拾伍 25 +貳拾陸 26 +貳拾柒 27 +貳拾捌 28 +貳拾玖 29 +叁拾 30 +叁拾壹 31 +壹 1 +拾壹 11 +贰拾壹 21 +贰 2 +陆 6 +拾贰 12 +拾陆 16 +贰拾贰 22 +贰拾陆 26 \ No 
newline at end of file From 130b35155f44db0bc4932f8408cac3080aa38298 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Wed, 31 May 2023 13:45:45 -0700 Subject: [PATCH 54/89] format issue recreated Signed-off-by: BuyuanCui --- .../zh/data/date/months.tsv | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 nemo_text_processing/inverse_text_normalization/zh/data/date/months.tsv diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/date/months.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/date/months.tsv new file mode 100644 index 000000000..5b2f33539 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/zh/data/date/months.tsv @@ -0,0 +1,49 @@ +一 1 +二 2 +三 3 +四 4 +五 5 +六 6 +七 7 +八 8 +九 9 +十 10 +十一 11 +十二 12 +一十 10 +零一 1 +零二 2 +零三 3 +零四 4 +零五 5 +零六 6 +零七 7 +零八 8 +零九 9 +壹 1 +贰 2 +叁 3 +肆 4 +伍 5 +陆 6 +柒 7 +捌 8 +玖 9 +拾 10 +拾壹 11 +拾贰 12 +壹拾 10 +零壹 1 +零贰 2 +零叁 3 +零肆 4 +零伍 5 +零陆 6 +零柒 7 +零捌 8 +零玖 9 +貳 2 +零貳 2 +陸 6 +零陸 6 +拾貳 12 \ No newline at end of file From ae1f3a81b2d7b8bbd04946f812b114fc16255d77 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 31 May 2023 20:47:44 +0000 Subject: [PATCH 55/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../zh/taggers/money.py | 70 ++++++++++++++++--- .../zh/verbalizers/money.py | 35 ++++++++-- 2 files changed, 89 insertions(+), 16 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py index a85b55200..661f8cade 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py @@ -23,9 +23,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="money", kind="classify") # imports - major_currency = 
pynini.string_file(get_abs_path("data/money/currency_major.tsv")) - minor_currency = pynini.string_file(get_abs_path("data/money/currency_minor.tsv")) - digits = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) + major_currency = pynini.string_file(get_abs_path("data/money/currency_major.tsv")) + minor_currency = pynini.string_file(get_abs_path("data/money/currency_minor.tsv")) + digits = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) graph_cardinal = cardinal.for_ordinals graph_decimal = decimal.final_graph_wo_negative # @@ -81,14 +81,64 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) ### - #graph_kuaimao = graph_kuai + pynutil.insert(" ") + graph_mao + # graph_kuaimao = graph_kuai + pynutil.insert(" ") + graph_mao graph_digits = pynutil.insert('fractional_part: "') + digits + pynutil.insert('"') - graph_kuaimao = graph_kuai + pynutil.insert(" ") + graph_digits + pynutil.insert(" ") + pynutil.insert('currency_minor: "') + pynini.closure(tencent, 1, 1) + pynutil.insert('"') - graph_kuaifen = graph_kuai + pynutil.insert(" ") + graph_digits + pynutil.insert(" ") + pynutil.insert('currency_minor: "') + pynini.closure(cent, 1, 1) + pynutil.insert('"') - graph_maofen = pynutil.insert('fractional_part: "') + digits + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('currency_minor: "') + pynini.closure(tencent, 1, 1) + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('fraction_part: "') + digits + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('currency_min: "') + pynini.closure(cent, 1, 1) + pynutil.insert('"') - - graph_kuaimaofen = graph_kuai + pynutil.insert(" ") + pynutil.insert('fractional_part: "') + digits + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('currency_minor: "') + pynini.closure(tencent, 1, 1) + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('fraction_part: "') + digits + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('currency_min: "') 
+ pynini.closure(cent, 1, 1) + pynutil.insert('"') - + graph_kuaimao = ( + graph_kuai + + pynutil.insert(" ") + + graph_digits + + pynutil.insert(" ") + + pynutil.insert('currency_minor: "') + + pynini.closure(tencent, 1, 1) + + pynutil.insert('"') + ) + graph_kuaifen = ( + graph_kuai + + pynutil.insert(" ") + + graph_digits + + pynutil.insert(" ") + + pynutil.insert('currency_minor: "') + + pynini.closure(cent, 1, 1) + + pynutil.insert('"') + ) + graph_maofen = ( + pynutil.insert('fractional_part: "') + + digits + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('currency_minor: "') + + pynini.closure(tencent, 1, 1) + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('fraction_part: "') + + digits + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('currency_min: "') + + pynini.closure(cent, 1, 1) + + pynutil.insert('"') + ) + + graph_kuaimaofen = ( + graph_kuai + + pynutil.insert(" ") + + pynutil.insert('fractional_part: "') + + digits + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('currency_minor: "') + + pynini.closure(tencent, 1, 1) + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('fraction_part: "') + + digits + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('currency_min: "') + + pynini.closure(cent, 1, 1) + + pynutil.insert('"') + ) + graph_mandarin = ( graph_kuai | graph_mao | graph_fen | graph_kuaimao | graph_kuaifen | graph_maofen | graph_kuaimaofen ) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py index c3522daa2..17d6071de 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py @@ -13,7 +13,12 @@ # limitations under the License. 
import pynini -from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, NEMO_DIGIT +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( + NEMO_DIGIT, + NEMO_NOT_QUOTE, + GraphFst, + delete_space, +) from pynini.lib import pynutil @@ -50,21 +55,39 @@ def __init__(self): lesser_symbol = pynini.accep("分") major_currency = pynutil.delete('currency_major: "') + major_symbol + pynutil.delete('"') - minor_currency = pynutil.delete('currency_minor: "') + minor_symbol + pynutil.delete('"') + minor_currency = pynutil.delete('currency_minor: "') + minor_symbol + pynutil.delete('"') lesser_currency = pynutil.delete('currency_min:"') + lesser_symbol + pynutil.delete('"') graph_kuai = number_unit + delete_space + major_currency graph_mao = number_unit + delete_space + minor_currency graph_mao = number_unit + delete_space + minor_currency graph_fen = number_unit + delete_space + lesser_currency - + graph_kuaimao = graph_kuai + delete_space + fraction_unit + delete_space + minor_currency graph_kuaifen = graph_kuai + delete_space + fraction_unit + delete_space + lesser_currency - graph_maofen = pynutil.delete('fractional_part: "') + pynini.closure(NEMO_DIGIT, 1) + pynutil.delete('"') + delete_space + pynutil.delete('currency_minor: "') + minor_symbol + pynutil.delete('"') + delete_space + pynutil.delete('fraction_part: "') + pynini.closure(NEMO_DIGIT, 1) + pynutil.delete('"') + delete_space + pynutil.delete('currency_min: "') + lesser_symbol + pynutil.delete('"') + graph_maofen = ( + pynutil.delete('fractional_part: "') + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete('"') + + delete_space + + pynutil.delete('currency_minor: "') + + minor_symbol + + pynutil.delete('"') + + delete_space + + pynutil.delete('fraction_part: "') + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete('"') + + delete_space + + pynutil.delete('currency_min: "') + + lesser_symbol + + pynutil.delete('"') + ) graph_all = 
graph_kuai + delete_space + graph_maofen - - graph_mandarin = (graph_kuai | graph_mao | graph_fen) | graph_kuaimao | graph_kuaifen | graph_maofen | graph_all + + graph_mandarin = ( + (graph_kuai | graph_mao | graph_fen) | graph_kuaimao | graph_kuaifen | graph_maofen | graph_all + ) graph_verbalizer = graph_regular | pynutil.add_weight(graph_mandarin, -2.0) From dde41363204110b6d12e20151c811f22cab5d16d Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Wed, 31 May 2023 13:50:15 -0700 Subject: [PATCH 56/89] fixed codeing style/format Signed-off-by: BuyuanCui --- .../zh/taggers/money.py | 70 ++++++++++++++++--- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py index a85b55200..661f8cade 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py @@ -23,9 +23,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="money", kind="classify") # imports - major_currency = pynini.string_file(get_abs_path("data/money/currency_major.tsv")) - minor_currency = pynini.string_file(get_abs_path("data/money/currency_minor.tsv")) - digits = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) + major_currency = pynini.string_file(get_abs_path("data/money/currency_major.tsv")) + minor_currency = pynini.string_file(get_abs_path("data/money/currency_minor.tsv")) + digits = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) graph_cardinal = cardinal.for_ordinals graph_decimal = decimal.final_graph_wo_negative # @@ -81,14 +81,64 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) ### - #graph_kuaimao = graph_kuai + pynutil.insert(" ") + graph_mao + # graph_kuaimao = graph_kuai + pynutil.insert(" ") + graph_mao graph_digits = pynutil.insert('fractional_part: "') + digits + 
pynutil.insert('"') - graph_kuaimao = graph_kuai + pynutil.insert(" ") + graph_digits + pynutil.insert(" ") + pynutil.insert('currency_minor: "') + pynini.closure(tencent, 1, 1) + pynutil.insert('"') - graph_kuaifen = graph_kuai + pynutil.insert(" ") + graph_digits + pynutil.insert(" ") + pynutil.insert('currency_minor: "') + pynini.closure(cent, 1, 1) + pynutil.insert('"') - graph_maofen = pynutil.insert('fractional_part: "') + digits + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('currency_minor: "') + pynini.closure(tencent, 1, 1) + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('fraction_part: "') + digits + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('currency_min: "') + pynini.closure(cent, 1, 1) + pynutil.insert('"') - - graph_kuaimaofen = graph_kuai + pynutil.insert(" ") + pynutil.insert('fractional_part: "') + digits + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('currency_minor: "') + pynini.closure(tencent, 1, 1) + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('fraction_part: "') + digits + pynutil.insert('"') + pynutil.insert(" ") + pynutil.insert('currency_min: "') + pynini.closure(cent, 1, 1) + pynutil.insert('"') - + graph_kuaimao = ( + graph_kuai + + pynutil.insert(" ") + + graph_digits + + pynutil.insert(" ") + + pynutil.insert('currency_minor: "') + + pynini.closure(tencent, 1, 1) + + pynutil.insert('"') + ) + graph_kuaifen = ( + graph_kuai + + pynutil.insert(" ") + + graph_digits + + pynutil.insert(" ") + + pynutil.insert('currency_minor: "') + + pynini.closure(cent, 1, 1) + + pynutil.insert('"') + ) + graph_maofen = ( + pynutil.insert('fractional_part: "') + + digits + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('currency_minor: "') + + pynini.closure(tencent, 1, 1) + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('fraction_part: "') + + digits + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('currency_min: "') + + 
pynini.closure(cent, 1, 1) + + pynutil.insert('"') + ) + + graph_kuaimaofen = ( + graph_kuai + + pynutil.insert(" ") + + pynutil.insert('fractional_part: "') + + digits + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('currency_minor: "') + + pynini.closure(tencent, 1, 1) + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('fraction_part: "') + + digits + + pynutil.insert('"') + + pynutil.insert(" ") + + pynutil.insert('currency_min: "') + + pynini.closure(cent, 1, 1) + + pynutil.insert('"') + ) + graph_mandarin = ( graph_kuai | graph_mao | graph_fen | graph_kuaimao | graph_kuaifen | graph_maofen | graph_kuaimaofen ) From 07d7e94b8e7a73395b531392fca6f6b6cc72768e Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Wed, 31 May 2023 13:50:33 -0700 Subject: [PATCH 57/89] fixed coding style and format Signed-off-by: BuyuanCui --- .../zh/verbalizers/money.py | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py index c3522daa2..17d6071de 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py @@ -13,7 +13,12 @@ # limitations under the License. 
import pynini -from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, NEMO_DIGIT +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( + NEMO_DIGIT, + NEMO_NOT_QUOTE, + GraphFst, + delete_space, +) from pynini.lib import pynutil @@ -50,21 +55,39 @@ def __init__(self): lesser_symbol = pynini.accep("分") major_currency = pynutil.delete('currency_major: "') + major_symbol + pynutil.delete('"') - minor_currency = pynutil.delete('currency_minor: "') + minor_symbol + pynutil.delete('"') + minor_currency = pynutil.delete('currency_minor: "') + minor_symbol + pynutil.delete('"') lesser_currency = pynutil.delete('currency_min:"') + lesser_symbol + pynutil.delete('"') graph_kuai = number_unit + delete_space + major_currency graph_mao = number_unit + delete_space + minor_currency graph_mao = number_unit + delete_space + minor_currency graph_fen = number_unit + delete_space + lesser_currency - + graph_kuaimao = graph_kuai + delete_space + fraction_unit + delete_space + minor_currency graph_kuaifen = graph_kuai + delete_space + fraction_unit + delete_space + lesser_currency - graph_maofen = pynutil.delete('fractional_part: "') + pynini.closure(NEMO_DIGIT, 1) + pynutil.delete('"') + delete_space + pynutil.delete('currency_minor: "') + minor_symbol + pynutil.delete('"') + delete_space + pynutil.delete('fraction_part: "') + pynini.closure(NEMO_DIGIT, 1) + pynutil.delete('"') + delete_space + pynutil.delete('currency_min: "') + lesser_symbol + pynutil.delete('"') + graph_maofen = ( + pynutil.delete('fractional_part: "') + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete('"') + + delete_space + + pynutil.delete('currency_minor: "') + + minor_symbol + + pynutil.delete('"') + + delete_space + + pynutil.delete('fraction_part: "') + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete('"') + + delete_space + + pynutil.delete('currency_min: "') + + lesser_symbol + + pynutil.delete('"') + ) graph_all = 
graph_kuai + delete_space + graph_maofen - - graph_mandarin = (graph_kuai | graph_mao | graph_fen) | graph_kuaimao | graph_kuaifen | graph_maofen | graph_all + + graph_mandarin = ( + (graph_kuai | graph_mao | graph_fen) | graph_kuaimao | graph_kuaifen | graph_maofen | graph_all + ) graph_verbalizer = graph_regular | pynutil.add_weight(graph_mandarin, -2.0) From 6759721df22518a92aaf1cd15783d612a8eb7b1a Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Wed, 7 Jun 2023 14:13:37 -0700 Subject: [PATCH 58/89] =?UTF-8?q?removed=20duplicated=20graph=20for=20?= =?UTF-8?q?=E6=AF=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/verbalizers/money.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py index 17d6071de..2fd3919a4 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/money.py @@ -60,7 +60,6 @@ def __init__(self): graph_kuai = number_unit + delete_space + major_currency graph_mao = number_unit + delete_space + minor_currency - graph_mao = number_unit + delete_space + minor_currency graph_fen = number_unit + delete_space + lesser_currency graph_kuaimao = graph_kuai + delete_space + fraction_unit + delete_space + minor_currency From 60fd16c2e4c5090b942449e17379709d7bac6f4f Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 27 Jun 2023 07:35:42 -0700 Subject: [PATCH 59/89] removed the comment Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/money.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py index 661f8cade..d0a24ab3b 100644 --- 
a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py @@ -80,8 +80,6 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + pynutil.insert('"') ) - ### - # graph_kuaimao = graph_kuai + pynutil.insert(" ") + graph_mao graph_digits = pynutil.insert('fractional_part: "') + digits + pynutil.insert('"') graph_kuaimao = ( graph_kuai From a4bc7cc66337e1953a2e2efda19253ae2f7386f1 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 27 Jun 2023 07:37:56 -0700 Subject: [PATCH 60/89] removed the comment Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/ordinal.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py index b1eacaa09..47ffbdd36 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/ordinal.py @@ -17,7 +17,6 @@ from pynini.lib import pynutil -# A third way, seems to work fine but might have potential issues? 
class OrdinalFst(GraphFst): def __init__(self, cardinal: GraphFst): super().__init__(name="ordinal", kind="classify") From bea168ed1445dbd84078ac13045442fa439ffc0c Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 27 Jun 2023 08:18:02 -0700 Subject: [PATCH 61/89] removing unnecessary comments Signed-off-by: BuyuanCui --- .../zh/taggers/tokenize_and_classify.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py index 70018a8c1..957f886cd 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py @@ -26,16 +26,10 @@ from nemo_text_processing.inverse_text_normalization.zh.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.zh.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.zh.taggers.decimal import DecimalFst - -# from nemo_text_processing.inverse_text_normalization.zh.taggers.electronic import ElectronicFst from nemo_text_processing.inverse_text_normalization.zh.taggers.fraction import FractionFst - -# from nemo_text_processing.inverse_text_normalization.zh.taggers.measure import MeasureFst from nemo_text_processing.inverse_text_normalization.zh.taggers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.zh.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.zh.taggers.punctuation import PunctuationFst - -# from nemo_text_processing.inverse_text_normalization.zh.taggers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.zh.taggers.time import TimeFst from nemo_text_processing.inverse_text_normalization.zh.taggers.whitelist import WhiteListFst from 
nemo_text_processing.inverse_text_normalization.zh.taggers.word import WordFst @@ -55,7 +49,7 @@ class ClassifyFst(GraphFst): def __init__( self, - input_case: INPUT_LOWER_CASED, + input_case: str, cache_dir: str = None, whitelist: str = None, overwrite_cache: bool = False, From d4905ce51c2a0934ce06ca46e540501ba861a283 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 27 Jun 2023 08:18:35 -0700 Subject: [PATCH 62/89] unnecessary comment removed Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/verbalizers/verbalize.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py index 8a1ae31da..b379c4d94 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize.py @@ -16,15 +16,9 @@ from nemo_text_processing.inverse_text_normalization.zh.verbalizers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.zh.verbalizers.date import DateFst from nemo_text_processing.inverse_text_normalization.zh.verbalizers.decimal import DecimalFst - -# from nemo_text_processing.inverse_text_normalization.zh.verbalizers.electronic import ElectronicFst from nemo_text_processing.inverse_text_normalization.zh.verbalizers.fraction import FractionFst - -# from nemo_text_processing.inverse_text_normalization.zh.verbalizers.measure import MeasureFst from nemo_text_processing.inverse_text_normalization.zh.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.zh.verbalizers.ordinal import OrdinalFst - -# from nemo_text_processing.inverse_text_normalization.zh.verbalizers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.zh.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.zh.verbalizers.whitelist 
import WhiteListFst From 92cbc07280bb7a492c069f4e21f786d61b84a001 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 27 Jun 2023 08:47:12 -0700 Subject: [PATCH 63/89] test file updated for more cases Signed-off-by: BuyuanCui --- .../zh/data_inverse_text_normalization/test_cases_time.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt index 922a2ea0e..01b2a5d15 100644 --- a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_time.txt @@ -17,4 +17,7 @@ 十点~10点 五分钟~5分钟 五秒钟~5秒钟 -十三点五分~13:05 \ No newline at end of file +十三点五分~13:05 +十三点零五分~13:05 +五点二十五分~05:25 +十一点三十四分~11:34 \ No newline at end of file From 12fb0361536cac56a77ebe000c99e030e3d46246 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 Jun 2023 15:48:20 +0000 Subject: [PATCH 64/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../zh/taggers/tokenize_and_classify.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py index 957f886cd..920f0e874 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py @@ -48,11 +48,7 @@ class ClassifyFst(GraphFst): """ def __init__( - self, - input_case: str, - cache_dir: str = None, - whitelist: str = None, - overwrite_cache: bool = False, + self, input_case: str, cache_dir: str = None, whitelist: str = None, overwrite_cache: bool = False, ): 
super().__init__(name="tokenize_and_classify", kind="classify") From 545a54ad17f9d680917ced63577ed0ca1df061d9 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 27 Jun 2023 08:50:24 -0700 Subject: [PATCH 65/89] updated with a comment explaining why this file is kept Signed-off-by: BuyuanCui --- nemo_text_processing/text_normalization/zh/graph_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_text_processing/text_normalization/zh/graph_utils.py b/nemo_text_processing/text_normalization/zh/graph_utils.py index fd609ab5e..37cb14d45 100644 --- a/nemo_text_processing/text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/text_normalization/zh/graph_utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +# gaph_utils is kept due to the fatc that importing from en folders will cause import errors that the data file names have to be the same with what are in the en folder import logging import os import string From d058421ed4aded651e0717256a899c51aae76cbf Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 27 Jun 2023 08:50:41 -0700 Subject: [PATCH 66/89] updated the file explaining why this file is kept Signed-off-by: BuyuanCui --- nemo_text_processing/text_normalization/zh/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nemo_text_processing/text_normalization/zh/utils.py b/nemo_text_processing/text_normalization/zh/utils.py index b3d03c602..2d78a8ea0 100644 --- a/nemo_text_processing/text_normalization/zh/utils.py +++ b/nemo_text_processing/text_normalization/zh/utils.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +# gaph_utils is kept due to the fatc that importing from en folders will cause import errors that the data file names have to be the same with what are in the en folder import csv import os From be6781830fd52a8456ce452e1327fcaf7ff31cbb Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 27 Jun 2023 15:57:09 -0700 Subject: [PATCH 67/89] added Mandarin as zh Signed-off-by: BuyuanCui --- .../inverse_text_normalization/inverse_normalize.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index e210fc4dc..69dc865b0 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -101,6 +101,11 @@ def __init__( from nemo_text_processing.inverse_text_normalization.es_en.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) + elif lang == 'zh': # Mandarin + from nemo_text_processing.inverse_text_normalization.zh.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.inverse_text_normalization.zh.verbalizers.verbalize_final import ( + VerbalizeFinalFst, + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case From 476fa613615c56d4ca003b355c347c9b637c1044 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 27 Jun 2023 16:23:28 -0700 Subject: [PATCH 68/89] removing for dplication Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/graph_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index 798ffc189..ea64cb69f 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -18,7 +18,7 
@@ from pathlib import Path from typing import Dict -import pynini +#import pynini from pynini import Far from pynini.export import export from pynini.lib import byte, pynutil, utf8 From 06768d2df2122afde55f16070cb8d5917cdf10e4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 Jun 2023 23:23:48 +0000 Subject: [PATCH 69/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/zh/graph_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index ea64cb69f..a85292f86 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -18,7 +18,7 @@ from pathlib import Path from typing import Dict -#import pynini +# import pynini from pynini import Far from pynini.export import export from pynini.lib import byte, pynutil, utf8 From d5c40252d40e5166339b1db5cf4f8f36915ff126 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 27 Jun 2023 16:53:02 -0700 Subject: [PATCH 70/89] removed unused NEMO objects Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/graph_utils.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index ea64cb69f..690aedc19 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -18,7 +18,7 @@ from pathlib import Path from typing import Dict -#import pynini +import pynini from pynini import Far from pynini.export import export from pynini.lib import byte, pynutil, utf8 @@ -26,20 +26,13 @@ NEMO_CHAR = 
utf8.VALID_UTF8_CHAR NEMO_DIGIT = byte.DIGIT -NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() -NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() -NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() -NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() NEMO_HEX = pynini.union(*string.hexdigits).optimize() NEMO_NON_BREAKING_SPACE = "\u00A0" NEMO_SPACE = " " NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() - NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() -NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize() - NEMO_SIGMA = pynini.closure(NEMO_CHAR) delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) From cbf6ffceb240cf2f200b5b3cfd8fb06d0931273d Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 27 Jun 2023 17:42:39 -0700 Subject: [PATCH 71/89] removed duplicates Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/graph_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index e4d5472d9..13e8ab6d0 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -19,7 +19,6 @@ from typing import Dict import pynini -from pynini import Far from pynini.export import export from pynini.lib import byte, pynutil, utf8 @@ -133,7 +132,7 @@ def __init__(self, name: str, kind: str, deterministic: bool = True): self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far") if self.far_exist(): - self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + self._fst = pynini.Far(self.far_path, mode="r", 
arc_type="standard", far_type="default").get_fst() def far_exist(self) -> bool: """ From 2cd9af40fd4334b85625793fd29c9c82710b320b Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Wed, 28 Jun 2023 07:53:25 -0700 Subject: [PATCH 72/89] removing unsed imports Signed-off-by: BuyuanCui --- .../zh/taggers/tokenize_and_classify.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py index 920f0e874..a46563170 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py @@ -17,7 +17,6 @@ import pynini from nemo_text_processing.inverse_text_normalization.zh.graph_utils import ( - INPUT_LOWER_CASED, GraphFst, delete_extra_space, delete_space, From cb7fb16f0570f12ebe30c5dc5eb66ca67ec11e45 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 14:47:30 -0700 Subject: [PATCH 73/89] updates to fix test file failures Signed-off-by: BuyuanCui --- .../zh/data/money/currency_major.tsv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major.tsv index d9b1a6c8f..9761245b8 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major.tsv +++ b/nemo_text_processing/inverse_text_normalization/zh/data/money/currency_major.tsv @@ -70,6 +70,6 @@ 日元 JPY¥ 日圆 JPY¥ 日圓 JPY¥ -人民币 ¥ -人民幣 ¥ +人民币 ¥ +人民幣 ¥ 元 ¥ \ No newline at end of file From 7425d8975ec64d6681ccd29f6c89b7e9b42e270c Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 14:47:50 -0700 Subject: [PATCH 74/89] updates to fix file failtures Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/data/time/time_mandarin.tsv | 1 + 1 
file changed, 1 insertion(+) diff --git a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_mandarin.tsv b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_mandarin.tsv index 27bc5539a..7fd465fa1 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/data/time/time_mandarin.tsv +++ b/nemo_text_processing/inverse_text_normalization/zh/data/time/time_mandarin.tsv @@ -1,5 +1,6 @@ 一 1 二 2 +两 2 三 3 四 4 五 5 From ee19a6ab59dbe1df993b38ce49f85cc5fe33f94f Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 14:48:19 -0700 Subject: [PATCH 75/89] updates to resolve test case failture Signed-off-by: BuyuanCui --- .../zh/taggers/cardinal.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index 8ad7a597d..e1a0e0106 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -46,18 +46,18 @@ def __init__(self): graph_digits = digits | pynutil.insert("0") # grammar for teens - ten = pynini.string_map([("十", "1"), ("拾", "1"), ("壹拾", "1"), ("壹拾", "1")]) + ten = pynini.string_map([("十", "1"), ("拾", "1"), ("壹拾", "1"), ("一十", "1")]) graph_teens = ten + graph_digits - graph_teens = graph_teens | pynutil.insert("0") + graph_teens = graph_teens #| pynutil.insert("00") # grammar for tens, not the output for Cardinal grammar but for pure Arabic digits (used in other grammars) graph_tens = (ties + graph_digits) | (pynini.cross(pynini.accep("零"), "0") + graph_digits) - graph_all = graph_tens | pynutil.insert("00") + graph_all = graph_tens | graph_teens | pynutil.insert("00") # grammar for hundreds 百 graph_hundreds_complex = (graph_digits + delete_hundreds + graph_all) | ( graph_digits + delete_hundreds + pynini.cross(pynini.closure("零"), "0") + graph_digits - ) + 
) | (graph_digits + delete_hundreds + graph_teens) graph_hundreds = graph_hundreds_complex graph_hundreds = graph_hundreds | pynutil.insert("000") @@ -82,10 +82,10 @@ def __init__(self): # grammmar for hundred thousands 十万 graph_hundred_thousands_simple = graph_all + closure_ten_thousands graph_hundred_thousands_complex = ( - (graph_tens + delete_ten_thousands + graph_thousands_complex) - | (graph_tens + delete_ten_thousands + pynini.cross(pynini.closure("零"), "0") + graph_hundreds_complex) - | (graph_tens + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all) - | (graph_tens + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits) + (graph_all + delete_ten_thousands + graph_thousands_complex) + | (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "0") + graph_hundreds_complex) + | (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all) + | (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits) ) graph_hundred_thousands = (graph_hundred_thousands_simple | graph_hundred_thousands_complex) | pynutil.insert( "000000" From 34c57027f2b4dd8c6148084d3155bc2bf945be7d Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 14:48:45 -0700 Subject: [PATCH 76/89] updates to resolve test case failure Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/decimal.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py index 95bfd30c8..80729d8fe 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py @@ -86,13 +86,13 @@ def __init__(self, cardinal: GraphFst): # grammar for fractional part delete_zero = pynini.closure(pynini.cross("零", "0")) 
graph_string_of_cardinals = cardinal_after_decimal - graph_string_of_cardinals = ( + graph_string_of_cardinals = pynini.closure(( pynini.closure(graph_string_of_cardinals) + delete_zero + pynini.closure(graph_string_of_cardinals) - ) - graph_fractional = pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"') + ), 1) + graph_fractional = pynini.closure(pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"'),1) # grammar for decimal: integer+delete character+part after decimal point - graph_decimal_no_sign = graph_integer_or_none + delete_decimal + graph_fractional + graph_decimal_no_sign = pynini.closure((graph_integer_or_none + delete_decimal + graph_fractional), 1) # New Grammar added for Money self.final_graph_wo_negative = graph_decimal_no_sign | get_quantity( From 18832400d318247362b90e9ff7aacf7d5dc6c6ef Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 14:49:05 -0700 Subject: [PATCH 77/89] updates to resolve test case failure Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/time.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py index 240ac03ee..adba3e850 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py @@ -62,7 +62,7 @@ def __init__(self): + graph_minutes_component + pynutil.insert(" ") + graph_seconds_component - ) + ) quarter_mandarin = ( quarters + pynini.accep("刻") | pynini.cross("刻鈡", "刻钟") | pynini.accep("刻钟") | pynini.accep("半") @@ -82,7 +82,7 @@ def __init__(self): | pynini.cross("個小時", "个小时") ) minute_mandarin = pynini.accep("分") | pynini.cross("分鐘", "分钟") | pynini.accep("分钟") - second_mandarin = pynini.accep("秒") | pynini.cross("秒鐘", "秒钟") + second_mandarin = pynini.accep("秒") | 
pynini.cross("秒鐘", "秒钟") | pynini.accep("秒钟") hours_only = for_mandarin + hour_mandarin minutes_only = for_mandarin + minute_mandarin From b05356f76eb4c97323559a58cdee035e94de7635 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 14:49:33 -0700 Subject: [PATCH 78/89] updates to resolve test case failure Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/verbalizers/time.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py index e39b200d1..f735d1cb7 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py @@ -70,7 +70,7 @@ def __init__(self): | pynini.accep("时") | pynini.accep("个钟头") | pynini.accep("个点") - | pynini.accep("半") + | pynini.accep("个小时") ) hour_mandarin = ( pynutil.delete("hours:") @@ -79,12 +79,12 @@ def __init__(self): + (pynini.closure(NEMO_DIGIT) + pynini.closure(hours, 1)) + pynutil.delete('"') ) - minutes = pynini.accep("分") | pynini.accep("分钟") + minutes = pynini.accep("分") | pynini.accep("分钟") | pynini.accep("半") minute_mandarin = ( pynutil.delete("minutes:") + delete_space + pynutil.delete('"') - + (pynini.closure(NEMO_DIGIT) + pynini.closure(minutes, 1)) + + (((pynini.closure(NEMO_DIGIT) + pynini.closure(minutes, 1))) | pynini.closure(minutes, 1)) + pynutil.delete('"') ) seconds = pynini.accep("秒") | pynini.accep("秒钟") @@ -109,7 +109,10 @@ def __init__(self): | minute_mandarin | second_mandarin | quarter_mandarin - | (hour_mandarin + delete_space + quarter_mandarin) + | (hour_mandarin + delete_space + quarter_mandarin) + | (hour_mandarin + delete_space + minute_mandarin) + | (hour_mandarin + delete_space + minute_mandarin + delete_space + second_mandarin) + | (minute_mandarin + delete_space + second_mandarin) ) final_graph = graph_regular_time | 
graph_mandarin_time From 9d53722ca259f152268a54eb000bc56b33193747 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 14:50:05 -0700 Subject: [PATCH 79/89] updates to adapt to cardinal grammar changes Signed-off-by: BuyuanCui --- .../data_inverse_text_normalization/test_cases_cardinal.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt index 37052db4a..02d3dcbcf 100644 --- a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_cardinal.txt @@ -5,7 +5,7 @@ 九百~900 九百五十~950 九百五十一~951 -一千~1000 +一千~1,000 一千零一~1,001 一千一百~1,100 一千一百零一~1,101 @@ -13,7 +13,7 @@ 一千一百一十~1,110 一千一百十~1,110 一千一百一十一~1,111 -两千~2千 +两千~2,000 九千九百九十九~9,999 一万一千~11,000 一万一千一百~11,100 From bf4868ae69b7c5bae34697bc59af3f4151c42b95 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 14:50:38 -0700 Subject: [PATCH 80/89] updates to adapt to grammar changes Signed-off-by: BuyuanCui --- .../test_cases_money.txt | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_money.txt index 2d1311597..2504e7d44 100644 --- a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_money.txt @@ -1,47 +1,47 @@ -一千美元~US$1千 -五千美元~US$5千 +一千美元~US$1000 +五千美元~US$5000 一万美元~US$1万 一点五万美元~US$1.5万 五十万美元~US$50万 一百万美元~US$100万 一千万美元~US$1000万 -一千元~¥1千 -五千元~¥5千 -一万元~¥1万 -一千五万元~¥1005万 -五十万元~¥50万 -一百万元~¥100万 -一千万元~¥1000万 -一千欧元~€1千 -五千欧元~€5千 +一千元~¥1000 +五千元~¥5000 +一万元~¥1万 +一千五万元~¥1005万 +五十万元~¥50万 +一百万元~¥100万 +一千万元~¥1000万 
+一千欧元~€1000 +五千欧元~€5000 一万欧元~€1万 一点五万欧元~€1.5万 五十万欧元~€50万 一百万欧元~€100万 一千万欧元~€1000万 -一千英镑~£1千 -五千英镑~£5千 +一千英镑~£1000 +五千英镑~£5000 一万英镑~£1万 一点五万英镑~£1.5万 五十万英镑~£50万 一百万英镑~£100万 一千万英镑~£1000万 -一千韩元~₩1千 -五千韩元~₩5千 +一千韩元~₩1000 +五千韩元~₩5000 一万韩元~₩1万 一点五万韩元~₩1.5万 五十万韩元~₩50万 一百万韩元~₩100万 一千万韩元~₩1000万 -一千印度卢布~₹1千 -五千印度卢布~₹5千 +一千印度卢布~₹1000 +五千印度卢布~₹5000 一万印度卢布~₹1万 一点五万印度卢布~₹1.5万 五十万印度卢布~₹50万 一百万印度卢布~₹100万 一千万印度卢布~₹1000万 -一千日元~JPY¥1千 -五千日元~JPY¥5千 +一千日元~JPY¥1000 +五千日元~JPY¥5000 一万日元~JPY¥1万 一点五万日元~JPY¥1.5万 五十万日元~JPY¥50万 From a8b7e7227681a7a3e8df98ee09e313dc5a066bfc Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 14:51:06 -0700 Subject: [PATCH 81/89] updates to adapt to cardinal grammar changes Signed-off-by: BuyuanCui --- .../zh/data_inverse_text_normalization/test_cases_ordinal.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_ordinal.txt index d6e15f1dd..828ec6203 100644 --- a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_ordinal.txt +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_ordinal.txt @@ -3,7 +3,7 @@ 第兩萬一千一百一十一~第21111 第一百~第100 第二百~第200 -第兩千~第2千 +第兩千~第2000 第两万~第2万 第十万~第10万 第一百万~第100万 From 5f58f5224e33a6d87826c376c6c5bd0988f2b741 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 Jun 2023 21:51:40 +0000 Subject: [PATCH 82/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/zh/taggers/cardinal.py | 10 ++++++---- .../inverse_text_normalization/zh/taggers/decimal.py | 10 ++++++---- .../inverse_text_normalization/zh/taggers/time.py | 2 +- .../inverse_text_normalization/zh/verbalizers/time.py | 2 +- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git 
a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index e1a0e0106..e1235c193 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -48,16 +48,18 @@ def __init__(self): # grammar for teens ten = pynini.string_map([("十", "1"), ("拾", "1"), ("壹拾", "1"), ("一十", "1")]) graph_teens = ten + graph_digits - graph_teens = graph_teens #| pynutil.insert("00") + graph_teens = graph_teens # | pynutil.insert("00") # grammar for tens, not the output for Cardinal grammar but for pure Arabic digits (used in other grammars) graph_tens = (ties + graph_digits) | (pynini.cross(pynini.accep("零"), "0") + graph_digits) graph_all = graph_tens | graph_teens | pynutil.insert("00") # grammar for hundreds 百 - graph_hundreds_complex = (graph_digits + delete_hundreds + graph_all) | ( - graph_digits + delete_hundreds + pynini.cross(pynini.closure("零"), "0") + graph_digits - ) | (graph_digits + delete_hundreds + graph_teens) + graph_hundreds_complex = ( + (graph_digits + delete_hundreds + graph_all) + | (graph_digits + delete_hundreds + pynini.cross(pynini.closure("零"), "0") + graph_digits) + | (graph_digits + delete_hundreds + graph_teens) + ) graph_hundreds = graph_hundreds_complex graph_hundreds = graph_hundreds | pynutil.insert("000") diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py index 80729d8fe..f334f2675 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py @@ -86,10 +86,12 @@ def __init__(self, cardinal: GraphFst): # grammar for fractional part delete_zero = pynini.closure(pynini.cross("零", "0")) graph_string_of_cardinals = cardinal_after_decimal - graph_string_of_cardinals = 
pynini.closure(( - pynini.closure(graph_string_of_cardinals) + delete_zero + pynini.closure(graph_string_of_cardinals) - ), 1) - graph_fractional = pynini.closure(pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"'),1) + graph_string_of_cardinals = pynini.closure( + (pynini.closure(graph_string_of_cardinals) + delete_zero + pynini.closure(graph_string_of_cardinals)), 1 + ) + graph_fractional = pynini.closure( + pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"'), 1 + ) # grammar for decimal: integer+delete character+part after decimal point graph_decimal_no_sign = pynini.closure((graph_integer_or_none + delete_decimal + graph_fractional), 1) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py index adba3e850..9a3aca388 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py @@ -62,7 +62,7 @@ def __init__(self): + graph_minutes_component + pynutil.insert(" ") + graph_seconds_component - ) + ) quarter_mandarin = ( quarters + pynini.accep("刻") | pynini.cross("刻鈡", "刻钟") | pynini.accep("刻钟") | pynini.accep("半") diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py index f735d1cb7..4560fdf62 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py @@ -109,7 +109,7 @@ def __init__(self): | minute_mandarin | second_mandarin | quarter_mandarin - | (hour_mandarin + delete_space + quarter_mandarin) + | (hour_mandarin + delete_space + quarter_mandarin) | (hour_mandarin + delete_space + minute_mandarin) | (hour_mandarin + delete_space + minute_mandarin + delete_space + second_mandarin) | (minute_mandarin + 
delete_space + second_mandarin) From b8ed959da0c35ec70581fd6fe45c94ff8eeac167 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 14:53:01 -0700 Subject: [PATCH 83/89] fix style Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/cardinal.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index e1a0e0106..e1235c193 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -48,16 +48,18 @@ def __init__(self): # grammar for teens ten = pynini.string_map([("十", "1"), ("拾", "1"), ("壹拾", "1"), ("一十", "1")]) graph_teens = ten + graph_digits - graph_teens = graph_teens #| pynutil.insert("00") + graph_teens = graph_teens # | pynutil.insert("00") # grammar for tens, not the output for Cardinal grammar but for pure Arabic digits (used in other grammars) graph_tens = (ties + graph_digits) | (pynini.cross(pynini.accep("零"), "0") + graph_digits) graph_all = graph_tens | graph_teens | pynutil.insert("00") # grammar for hundreds 百 - graph_hundreds_complex = (graph_digits + delete_hundreds + graph_all) | ( - graph_digits + delete_hundreds + pynini.cross(pynini.closure("零"), "0") + graph_digits - ) | (graph_digits + delete_hundreds + graph_teens) + graph_hundreds_complex = ( + (graph_digits + delete_hundreds + graph_all) + | (graph_digits + delete_hundreds + pynini.cross(pynini.closure("零"), "0") + graph_digits) + | (graph_digits + delete_hundreds + graph_teens) + ) graph_hundreds = graph_hundreds_complex graph_hundreds = graph_hundreds | pynutil.insert("000") From abdb582cbee19ae77d86f5684a84d187b09f79ff Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 14:53:14 -0700 Subject: [PATCH 84/89] fix style Signed-off-by: BuyuanCui --- 
.../inverse_text_normalization/zh/taggers/decimal.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py index 80729d8fe..f334f2675 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py @@ -86,10 +86,12 @@ def __init__(self, cardinal: GraphFst): # grammar for fractional part delete_zero = pynini.closure(pynini.cross("零", "0")) graph_string_of_cardinals = cardinal_after_decimal - graph_string_of_cardinals = pynini.closure(( - pynini.closure(graph_string_of_cardinals) + delete_zero + pynini.closure(graph_string_of_cardinals) - ), 1) - graph_fractional = pynini.closure(pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"'),1) + graph_string_of_cardinals = pynini.closure( + (pynini.closure(graph_string_of_cardinals) + delete_zero + pynini.closure(graph_string_of_cardinals)), 1 + ) + graph_fractional = pynini.closure( + pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"'), 1 + ) # grammar for decimal: integer+delete character+part after decimal point graph_decimal_no_sign = pynini.closure((graph_integer_or_none + delete_decimal + graph_fractional), 1) From ce7919b14b59d01db43cc03b7010be188db01d22 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 14:53:27 -0700 Subject: [PATCH 85/89] fix style Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/time.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py index adba3e850..9a3aca388 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py +++ 
b/nemo_text_processing/inverse_text_normalization/zh/taggers/time.py @@ -62,7 +62,7 @@ def __init__(self): + graph_minutes_component + pynutil.insert(" ") + graph_seconds_component - ) + ) quarter_mandarin = ( quarters + pynini.accep("刻") | pynini.cross("刻鈡", "刻钟") | pynini.accep("刻钟") | pynini.accep("半") From 40786188da7d9d63f875cfe7f196085f99e8c34a Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 14:53:41 -0700 Subject: [PATCH 86/89] fix style Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/verbalizers/time.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py index f735d1cb7..4560fdf62 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/time.py @@ -109,7 +109,7 @@ def __init__(self): | minute_mandarin | second_mandarin | quarter_mandarin - | (hour_mandarin + delete_space + quarter_mandarin) + | (hour_mandarin + delete_space + quarter_mandarin) | (hour_mandarin + delete_space + minute_mandarin) | (hour_mandarin + delete_space + minute_mandarin + delete_space + second_mandarin) | (minute_mandarin + delete_space + second_mandarin) From 3af314125eef0c2dde2f8e928b72b8ae3c47b8aa Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 15:20:33 -0700 Subject: [PATCH 87/89] fixing pr checks Signed-off-by: BuyuanCui --- .../inverse_text_normalization/zh/taggers/cardinal.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index e1235c193..b29fc5fb3 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -48,7 +48,6 @@ def 
__init__(self): # grammar for teens ten = pynini.string_map([("十", "1"), ("拾", "1"), ("壹拾", "1"), ("一十", "1")]) graph_teens = ten + graph_digits - graph_teens = graph_teens # | pynutil.insert("00") # grammar for tens, not the output for Cardinal grammar but for pure Arabic digits (used in other grammars) graph_tens = (ties + graph_digits) | (pynini.cross(pynini.accep("零"), "0") + graph_digits) From f9c6d158845489c86ce053cdbc2c37db1f6b0273 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 29 Jun 2023 18:00:55 -0700 Subject: [PATCH 88/89] removed // for zhtn/itn cache Signed-off-by: BuyuanCui --- Jenkinsfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 493f85a6d..2a400c925 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-29-23-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } @@ -319,11 +319,11 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=zh --text="你" --cache_dir ${ZH_TN_CACHE}' } } - // stage('L0: ZH ITN grammars') { - // steps { - // sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=zh --text="二零零二年一月二十八日 " --cache_dir ${ZH_TN_CACHE}' - // } - // } + stage('L0: ZH ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=zh --text="二零零二年一月二十八日 " --cache_dir ${ZH_TN_CACHE}' + } + } } } From 820b80ddd570cef2d886235ac476579e4d3b62f4 Mon Sep 17 00:00:00 2001 From: "Buyuan(Alex) Cui" 
<69030297+BuyuanCui@users.noreply.github.com> Date: Thu, 29 Jun 2023 18:12:21 -0700 Subject: [PATCH 89/89] Update inverse_normalize.py Added zh as a selection to pass Jenkins checks. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> --- .../inverse_text_normalization/inverse_normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index 69dc865b0..5f73e601c 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -150,7 +150,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi', 'ar', 'es_en'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi', 'ar', 'es_en', 'zh'], default="en", type=str, )