From ed8f7d6caea66567933a86e05e4c0257ab2eb2eb Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Sun, 9 Apr 2023 18:57:58 +0000 Subject: [PATCH 1/5] port fix for telephone from swedish-itn branch Signed-off-by: Jim O'Regan --- .../sv/taggers/telephone.py | 48 ++++++++++++++----- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/nemo_text_processing/text_normalization/sv/taggers/telephone.py b/nemo_text_processing/text_normalization/sv/taggers/telephone.py index cc2666c9f..7d85072ef 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/sv/taggers/telephone.py @@ -15,6 +15,7 @@ import pynini from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, NEMO_SPACE, GraphFst, delete_extra_space, @@ -42,7 +43,7 @@ class TelephoneFst(GraphFst): 0XX-XXX XX 0XXX-XX XX XX 0XXX-XXX XX - + See: https://en.wikipedia.org/wiki/National_conventions_for_writing_telephone_numbers#Sweden https://codegolf.stackexchange.com/questions/195787/format-a-swedish-phone-number @@ -70,14 +71,23 @@ def __init__(self, deterministic: bool = True): special_numbers = pynini.string_file(get_abs_path("data/telephone/special_numbers.tsv")) + passable = pynini.union(":", ": ", " ") + prompt_pass = pynini.closure(pynutil.delete(passable) + insert_space, 0, 1) telephone_abbr = pynini.string_file(get_abs_path("data/telephone/telephone_abbr.tsv")) + telephone_abbr = telephone_abbr + prompt_pass telephone_prompt = pynini.string_file(get_abs_path("data/telephone/telephone_prompt.tsv")) - prompt = pynutil.insert("prompt: \"") + telephone_prompt + pynutil.insert("\"") - prompt |= pynutil.insert("prompt: \"") + telephone_abbr + pynutil.insert("\"") - prompt |= pynutil.insert("prompt: \"") + telephone_prompt + NEMO_SPACE + telephone_abbr + pynutil.insert("\"") + prompt_as_code = pynutil.insert("country_code: \"") + telephone_prompt + pynutil.insert("\"") + prompt_as_code |= pynutil.insert("country_code: \"") + telephone_abbr + pynutil.insert("\"") + prompt_as_code |= ( + pynutil.insert("country_code: \"") + telephone_prompt + NEMO_SPACE + telephone_abbr + pynutil.insert("\"") + ) + prompt_inner = telephone_prompt | telephone_abbr + prompt_inner |= telephone_prompt + NEMO_SPACE + telephone_abbr - country_code = pynini.closure(pynini.cross("+", "plus "), 0, 1) + one_two_or_three_digits - country_code = pynutil.insert("country_code: \"") + country_code + pynutil.insert("\"") + country = pynini.closure(pynini.cross("+", "plus "), 0, 1) + one_two_or_three_digits + country_code = pynutil.insert("country_code: \"") + country + pynutil.insert("\"") + country_code |= prompt_as_code + country_code |= pynutil.insert("country_code: \"") + prompt_inner + NEMO_SPACE + country + pynutil.insert("\"") opt_dash = pynini.closure(pynutil.delete("-"), 0, 1) area_part = zero_after_country_code + one_two_or_three_digits + opt_dash + add_separator @@ -102,19 +112,33 @@ def __init__(self, deterministic: bool = True): prompt_pass = pynutil.delete(passable) + insert_space special_numbers = pynutil.insert("number_part: \"") + special_numbers + pynutil.insert("\"") - prompt = prompt + prompt_pass graph = pynini.union( country_code + ensure_space + number_part, country_code + ensure_space + number_part + ext_prompt + extension, number_part + ext_prompt + extension, - prompt + number_part, - prompt + special_numbers, - prompt + country_code + number_part, - prompt + country_code + number_part + ext_prompt + extension, - prompt + number_part + ext_prompt + extension, + country_code + number_part, + country_code + special_numbers, + country_code + number_part + ext_prompt + extension, ) self.tel_graph = graph.optimize() + # No need to be so exact here, but better for ITN to have it + three_digit_area_code_digit_two = pynini.union("1", "2", "3", "4", "7") + three_digit_area_code_no_zero = (three_digit_area_code_digit_two + NEMO_DIGIT) @ cardinal.two_digits_read + three_digit_area_code = zero_space + three_digit_area_code_no_zero + four_digit_area_code_digit_two = pynini.union("5", "6", "9") + four_digit_area_code_no_zero = (four_digit_area_code_digit_two + NEMO_DIGIT) @ cardinal.three_digits_read + four_digit_area_code = zero_space + four_digit_area_code_no_zero + two_digit_area_code = "08" @ cardinal.two_digits_read + self.area_codes = two_digit_area_code | three_digit_area_code | four_digit_area_code + self.area_codes_no_zero = ( + three_digit_area_code_no_zero | four_digit_area_code_no_zero | pynini.cross("8", "åtta") + ) + country_code_lead = pynini.cross("+", "plus") | pynini.cross("00", "noll noll") + raw_country_codes = pynini.string_file(get_abs_path("data/telephone/country_codes.tsv")) + self.country_code = country_code_lead + insert_space + (raw_country_codes @ cardinal.any_read_digit) + self.country_plus_area_code = self.country_code + NEMO_SPACE + self.area_codes_no_zero + # ip ip_prompts = pynini.string_file(get_abs_path("data/telephone/ip_prompt.tsv")) ip_graph = one_two_or_three_digits + (pynini.cross(".", " punkt ") + one_two_or_three_digits) ** 3 From bba63b1af7fe42b920bda1bec29ac2c4b63096ca Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Sun, 9 Apr 2023 18:58:59 +0000 Subject: [PATCH 2/5] extend cardinal in non-deterministic mode Signed-off-by: Jim O'Regan --- nemo_text_processing/text_normalization/sv/taggers/cardinal.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nemo_text_processing/text_normalization/sv/taggers/cardinal.py b/nemo_text_processing/text_normalization/sv/taggers/cardinal.py index 0e2eae976..1096e53aa 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/sv/taggers/cardinal.py @@ -125,6 +125,7 @@ def __init__(self, deterministic: bool = True): ties |= pynutil.add_weight(alt_ties, -0.001) ties |= pynutil.add_weight(pynini.cross("4", "förtio"), -0.001) ties |= pynutil.add_weight(pynini.cross("4", "förti"), -0.001) + ties |= pynutil.add_weight(pynini.cross("2", "tju"), -0.001) # Any double digit graph_tens = teen @@ -328,6 +329,8 @@ def __init__(self, deterministic: bool = True): digit + insert_space + ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ graph_tens, -0.001 ) self.two_digits_read |= pynutil.add_weight(digit + insert_space + digit, -0.001) + self.any_read_digit |= self.two_digits_read + self.any_read_digit |= self.three_digits_read self.graph |= zero From 664f512d246b25a6fa9c9c7b2c17229b5c189690 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Sun, 9 Apr 2023 19:00:28 +0000 Subject: [PATCH 3/5] whitespace fixes Signed-off-by: Jim O'Regan --- nemo_text_processing/text_normalization/sv/taggers/decimal.py | 4 ++-- nemo_text_processing/text_normalization/sv/taggers/measure.py | 2 +- nemo_text_processing/text_normalization/sv/taggers/money.py | 2 +- nemo_text_processing/text_normalization/sv/taggers/time.py | 2 +- .../text_normalization/sv/taggers/tokenize_and_classify.py | 4 ++-- .../sv/taggers/tokenize_and_classify_with_audio.py | 4 ++-- nemo_text_processing/text_normalization/sv/utils.py | 2 +- .../text_normalization/sv/verbalizers/measure.py | 2 +- .../text_normalization/sv/verbalizers/verbalize_final.py | 1 - 9 files changed, 11 insertions(+), 12 deletions(-) diff --git a/nemo_text_processing/text_normalization/sv/taggers/decimal.py b/nemo_text_processing/text_normalization/sv/taggers/decimal.py index 1e1cb94c9..63111b104 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/sv/taggers/decimal.py @@ -34,7 +34,7 @@ def get_quantity( e.g. 1 miljon -> integer_part: "en" quantity: "miljon" e.g. 1,5 miljoner -> integer_part: "en" fractional_part: "fem" quantity: "miljoner" - Args: + Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST """ @@ -112,7 +112,7 @@ def get_quantity( class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. + Finite state transducer for classifying decimal, e.g. -12,5006 biljon -> decimal { negative: "true" integer_part: "tolv" fractional_part: "fem noll noll sex" quantity: "biljon" } 1 biljon -> decimal { integer_part: "en" quantity: "biljon" } diff --git a/nemo_text_processing/text_normalization/sv/taggers/measure.py b/nemo_text_processing/text_normalization/sv/taggers/measure.py index 28b98221b..0b625c678 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/measure.py +++ b/nemo_text_processing/text_normalization/sv/taggers/measure.py @@ -29,7 +29,7 @@ class MeasureFst(GraphFst): """ - Finite state transducer for classifying measure, suppletive aware, e.g. + Finite state transducer for classifying measure, suppletive aware, e.g. -12kg -> measure { negative: "true" cardinal { integer: "tolv" } units: "kilogram" } 1kg -> measure { cardinal { integer: "ett" } units: "kilogram" } ,5kg -> measure { decimal { fractional_part: "fem" } units: "kilogram" } diff --git a/nemo_text_processing/text_normalization/sv/taggers/money.py b/nemo_text_processing/text_normalization/sv/taggers/money.py index 29e53450b..9caaf385d 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/money.py +++ b/nemo_text_processing/text_normalization/sv/taggers/money.py @@ -35,7 +35,7 @@ class MoneyFst(GraphFst): """ - Finite state transducer for classifying money, suppletive aware, e.g. + Finite state transducer for classifying money, suppletive aware, e.g. $12,05 -> money { integer_part: "tolv" currency_maj: "dollar" fractional_part: "fem" currency_min: "cent" preserve_order: true } $12,0500 -> money { integer_part: "tolv" currency_maj: "dollar" fractional_part: "fem" currency_min: "cent" preserve_order: true } $1 -> money { currency_maj: "dollar" integer_part: "en" } diff --git a/nemo_text_processing/text_normalization/sv/taggers/time.py b/nemo_text_processing/text_normalization/sv/taggers/time.py index a9ad9064f..d89182bc4 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/time.py +++ b/nemo_text_processing/text_normalization/sv/taggers/time.py @@ -40,7 +40,7 @@ class TimeFst(GraphFst): 02:00 -> time { hours: "två" } 2:00 -> time { hours: "två" } 10:00:05 e.m. -> time { hours: "tio" minutes: "noll" seconds: "fem" suffix: "eftermiddag" } - + Args: cardinal: CardinalFst deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify.py index bcc68b118..1a9637c91 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify.py @@ -44,9 +44,9 @@ class ClassifyFst(GraphFst): """ Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. - For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify_with_audio.py b/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify_with_audio.py index 8ef77af97..f5c588647 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify_with_audio.py +++ b/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify_with_audio.py @@ -58,9 +58,9 @@ class ClassifyFst(GraphFst): """ Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archive (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/sv/utils.py b/nemo_text_processing/text_normalization/sv/utils.py index 6d20e29e0..332330921 100644 --- a/nemo_text_processing/text_normalization/sv/utils.py +++ b/nemo_text_processing/text_normalization/sv/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/measure.py b/nemo_text_processing/text_normalization/sv/verbalizers/measure.py index 4ca928008..0af155d2b 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/measure.py @@ -23,7 +23,7 @@ class MeasureFst(GraphFst): measure { negative: "true" cardinal { integer: "twelve" } units: "kilograms" } -> minus twelve kilograms measure { decimal { integer_part: "twelve" fractional_part: "five" } units: "kilograms" } -> twelve point five kilograms tokens { measure { units: "covid" decimal { integer_part: "nineteen" fractional_part: "five" } } } -> covid nineteen point five - + Args: decimal: DecimalFst cardinal: CardinalFst diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/sv/verbalizers/verbalize_final.py index 6cfc0e7c4..647bfe6b6 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/verbalize_final.py @@ -49,7 +49,6 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ self.fst = pynini.Far(far_file, mode="r")["verbalize"] logging.info(f'VerbalizeFinalFst graph was restored from {far_file}.') else: - verbalize = VerbalizeFst(deterministic=deterministic).fst word = WordFst(deterministic=deterministic).fst types = verbalize | word From d629c7361f66ea281658c54765d0e500e3f9df4d Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Sun, 9 Apr 2023 19:00:45 +0000 Subject: [PATCH 4/5] also fix in the verbaliser Signed-off-by: Jim O'Regan --- .../text_normalization/sv/verbalizers/telephone.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py b/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py index 7aa5bf334..2eb79a759 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py @@ -41,13 +41,6 @@ def __init__(self, deterministic: bool = True): optional_country_code = pynini.closure(country_code + delete_space + insert_space, 0, 1,) - prompt_part = ( - pynutil.delete("prompt: \"") - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynini.closure(pynutil.add_weight(pynutil.delete(" "), -0.0001), 0, 1) - + pynutil.delete("\"") - ) + NEMO_SPACE - number_part = ( pynutil.delete("number_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) @@ -65,9 +58,6 @@ def __init__(self, deterministic: bool = True): 1, ) - graph = pynini.union( - prompt_part + optional_country_code + number_part + optional_extension, - optional_country_code + number_part + optional_extension, - ) + graph = pynini.union(optional_country_code + number_part + optional_extension) delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() From 9ba21f3f8555db004d60d5bf009e2f9b7afd72a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jim=20O=E2=80=99Regan?= Date: Sun, 9 Apr 2023 21:06:17 +0200 Subject: [PATCH 5/5] Update Jenkinsfile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jim O’Regan --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1337deee1..6a7f14d96 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -20,7 +20,7 @@ pipeline { PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0' RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0' - SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-09-23-0' + SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-09-23-0' ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'