NVIDIA · ekmb · May 11, 2023 · Apr 9, 2023 · Apr 9, 2023 · Apr 9, 2023
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -20,7 +20,7 @@ pipeline {
     PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
     RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
     VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
-    SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-09-23-0'
+    SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-09-23-0'
     ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
     DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
 

diff --git a/nemo_text_processing/text_normalization/sv/taggers/cardinal.py b/nemo_text_processing/text_normalization/sv/taggers/cardinal.py
@@ -125,6 +125,7 @@ def __init__(self, deterministic: bool = True):
             ties |= pynutil.add_weight(alt_ties, -0.001)
             ties |= pynutil.add_weight(pynini.cross("4", "förtio"), -0.001)
             ties |= pynutil.add_weight(pynini.cross("4", "förti"), -0.001)
+            ties |= pynutil.add_weight(pynini.cross("2", "tju"), -0.001)
 
         # Any double digit
         graph_tens = teen
@@ -328,6 +329,8 @@ def __init__(self, deterministic: bool = True):
                 digit + insert_space + ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ graph_tens, -0.001
             )
             self.two_digits_read |= pynutil.add_weight(digit + insert_space + digit, -0.001)
+            self.any_read_digit |= self.two_digits_read
+            self.any_read_digit |= self.three_digits_read
 
         self.graph |= zero
 

diff --git a/nemo_text_processing/text_normalization/sv/taggers/decimal.py b/nemo_text_processing/text_normalization/sv/taggers/decimal.py
@@ -34,7 +34,7 @@ def get_quantity(
     e.g. 1 miljon -> integer_part: "en" quantity: "miljon"
     e.g. 1,5 miljoner -> integer_part: "en" fractional_part: "fem" quantity: "miljoner"
 
-    Args: 
+    Args:
         decimal: decimal FST
         cardinal_up_to_hundred: cardinal FST
     """
@@ -112,7 +112,7 @@ def get_quantity(
 
 class DecimalFst(GraphFst):
     """
-    Finite state transducer for classifying decimal, e.g. 
+    Finite state transducer for classifying decimal, e.g.
         -12,5006 biljon -> decimal { negative: "true" integer_part: "tolv"  fractional_part: "fem noll noll sex" quantity: "biljon" }
         1 biljon -> decimal { integer_part: "en" quantity: "biljon" }
 

diff --git a/nemo_text_processing/text_normalization/sv/taggers/measure.py b/nemo_text_processing/text_normalization/sv/taggers/measure.py
@@ -29,7 +29,7 @@
 
 class MeasureFst(GraphFst):
     """
-    Finite state transducer for classifying measure, suppletive aware, e.g. 
+    Finite state transducer for classifying measure, suppletive aware, e.g.
         -12kg -> measure { negative: "true" cardinal { integer: "tolv" } units: "kilogram" }
         1kg -> measure { cardinal { integer: "ett" } units: "kilogram" }
         ,5kg -> measure { decimal { fractional_part: "fem" } units: "kilogram" }

diff --git a/nemo_text_processing/text_normalization/sv/taggers/money.py b/nemo_text_processing/text_normalization/sv/taggers/money.py
@@ -35,7 +35,7 @@
 
 class MoneyFst(GraphFst):
     """
-    Finite state transducer for classifying money, suppletive aware, e.g. 
+    Finite state transducer for classifying money, suppletive aware, e.g.
         $12,05 -> money { integer_part: "tolv" currency_maj: "dollar" fractional_part: "fem" currency_min: "cent" preserve_order: true }
         $12,0500 -> money { integer_part: "tolv" currency_maj: "dollar" fractional_part: "fem" currency_min: "cent" preserve_order: true }
         $1 -> money { currency_maj: "dollar" integer_part: "en" }

diff --git a/nemo_text_processing/text_normalization/sv/taggers/telephone.py b/nemo_text_processing/text_normalization/sv/taggers/telephone.py
@@ -15,6 +15,7 @@
 
 import pynini
 from nemo_text_processing.text_normalization.en.graph_utils import (
+    NEMO_DIGIT,
     NEMO_SPACE,
     GraphFst,
     delete_extra_space,
@@ -42,7 +43,7 @@ class TelephoneFst(GraphFst):
         0XX-XXX XX
         0XXX-XX XX XX
         0XXX-XXX XX
-    
+
     See:
         https://en.wikipedia.org/wiki/National_conventions_for_writing_telephone_numbers#Sweden
         https://codegolf.stackexchange.com/questions/195787/format-a-swedish-phone-number
@@ -70,14 +71,23 @@ def __init__(self, deterministic: bool = True):
 
         special_numbers = pynini.string_file(get_abs_path("data/telephone/special_numbers.tsv"))
 
+        passable = pynini.union(":", ": ", " ")
+        prompt_pass = pynini.closure(pynutil.delete(passable) + insert_space, 0, 1)
         telephone_abbr = pynini.string_file(get_abs_path("data/telephone/telephone_abbr.tsv"))
+        telephone_abbr = telephone_abbr + prompt_pass
         telephone_prompt = pynini.string_file(get_abs_path("data/telephone/telephone_prompt.tsv"))
-        prompt = pynutil.insert("prompt: \"") + telephone_prompt + pynutil.insert("\"")
-        prompt |= pynutil.insert("prompt: \"") + telephone_abbr + pynutil.insert("\"")
-        prompt |= pynutil.insert("prompt: \"") + telephone_prompt + NEMO_SPACE + telephone_abbr + pynutil.insert("\"")
+        prompt_as_code = pynutil.insert("country_code: \"") + telephone_prompt + pynutil.insert("\"")
+        prompt_as_code |= pynutil.insert("country_code: \"") + telephone_abbr + pynutil.insert("\"")
+        prompt_as_code |= (
+            pynutil.insert("country_code: \"") + telephone_prompt + NEMO_SPACE + telephone_abbr + pynutil.insert("\"")
+        )
+        prompt_inner = telephone_prompt | telephone_abbr
+        prompt_inner |= telephone_prompt + NEMO_SPACE + telephone_abbr
 
-        country_code = pynini.closure(pynini.cross("+", "plus "), 0, 1) + one_two_or_three_digits
-        country_code = pynutil.insert("country_code: \"") + country_code + pynutil.insert("\"")
+        country = pynini.closure(pynini.cross("+", "plus "), 0, 1) + one_two_or_three_digits
+        country_code = pynutil.insert("country_code: \"") + country + pynutil.insert("\"")
+        country_code |= prompt_as_code
+        country_code |= pynutil.insert("country_code: \"") + prompt_inner + NEMO_SPACE + country + pynutil.insert("\"")
 
         opt_dash = pynini.closure(pynutil.delete("-"), 0, 1)
         area_part = zero_after_country_code + one_two_or_three_digits + opt_dash + add_separator
@@ -102,19 +112,33 @@ def __init__(self, deterministic: bool = True):
         prompt_pass = pynutil.delete(passable) + insert_space
 
         special_numbers = pynutil.insert("number_part: \"") + special_numbers + pynutil.insert("\"")
-        prompt = prompt + prompt_pass
         graph = pynini.union(
             country_code + ensure_space + number_part,
             country_code + ensure_space + number_part + ext_prompt + extension,
             number_part + ext_prompt + extension,
-            prompt + number_part,
-            prompt + special_numbers,
-            prompt + country_code + number_part,
-            prompt + country_code + number_part + ext_prompt + extension,
-            prompt + number_part + ext_prompt + extension,
+            country_code + number_part,
+            country_code + special_numbers,
+            country_code + number_part + ext_prompt + extension,
         )
         self.tel_graph = graph.optimize()
 
+        # Text normalization does not need area codes this exact, but the stricter graphs are better for ITN to have
+        three_digit_area_code_digit_two = pynini.union("1", "2", "3", "4", "7")
+        three_digit_area_code_no_zero = (three_digit_area_code_digit_two + NEMO_DIGIT) @ cardinal.two_digits_read
+        three_digit_area_code = zero_space + three_digit_area_code_no_zero
+        four_digit_area_code_digit_two = pynini.union("5", "6", "9")
+        four_digit_area_code_no_zero = (four_digit_area_code_digit_two + NEMO_DIGIT) @ cardinal.three_digits_read
+        four_digit_area_code = zero_space + four_digit_area_code_no_zero
+        two_digit_area_code = "08" @ cardinal.two_digits_read
+        self.area_codes = two_digit_area_code | three_digit_area_code | four_digit_area_code
+        self.area_codes_no_zero = (
+            three_digit_area_code_no_zero | four_digit_area_code_no_zero | pynini.cross("8", "åtta")
+        )
+        country_code_lead = pynini.cross("+", "plus") | pynini.cross("00", "noll noll")
+        raw_country_codes = pynini.string_file(get_abs_path("data/telephone/country_codes.tsv"))
+        self.country_code = country_code_lead + insert_space + (raw_country_codes @ cardinal.any_read_digit)
+        self.country_plus_area_code = self.country_code + NEMO_SPACE + self.area_codes_no_zero
+
         # ip
         ip_prompts = pynini.string_file(get_abs_path("data/telephone/ip_prompt.tsv"))
         ip_graph = one_two_or_three_digits + (pynini.cross(".", " punkt ") + one_two_or_three_digits) ** 3

diff --git a/nemo_text_processing/text_normalization/sv/taggers/time.py b/nemo_text_processing/text_normalization/sv/taggers/time.py
@@ -40,7 +40,7 @@ class TimeFst(GraphFst):
         02:00 -> time { hours: "två" }
         2:00 -> time { hours: "två" }
         10:00:05 e.m. -> time { hours: "tio" minutes: "noll" seconds: "fem" suffix: "eftermiddag" }
-    
+
     Args:
         cardinal: CardinalFst
         deterministic: if True will provide a single transduction option,

diff --git a/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify.py
@@ -44,9 +44,9 @@
 class ClassifyFst(GraphFst):
     """
     Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
-    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. 
+    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
     More details to deployment at NeMo/tools/text_processing_deployment.
-    
+
     Args:
         input_case: accepting either "lower_cased" or "cased" input.
         deterministic: if True will provide a single transduction option,

diff --git a/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify_with_audio.py b/nemo_text_processing/text_normalization/sv/taggers/tokenize_and_classify_with_audio.py
@@ -58,9 +58,9 @@
 class ClassifyFst(GraphFst):
     """
     Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
-    For deployment, this grammar will be compiled and exported to OpenFst Finate State Archive (FAR) File. 
+    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
     More details to deployment at NeMo/tools/text_processing_deployment.
-    
+
     Args:
         input_case: accepting either "lower_cased" or "cased" input.
         deterministic: if True will provide a single transduction option,

diff --git a/nemo_text_processing/text_normalization/sv/utils.py b/nemo_text_processing/text_normalization/sv/utils.py
@@ -22,7 +22,7 @@ def get_abs_path(rel_path):
 
     Args:
         rel_path: relative path to this file
-        
+
     Returns absolute path
     """
     return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path

diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/measure.py b/nemo_text_processing/text_normalization/sv/verbalizers/measure.py
@@ -23,7 +23,7 @@ class MeasureFst(GraphFst):
         measure { negative: "true" cardinal { integer: "twelve" } units: "kilograms" } -> minus twelve kilograms
         measure { decimal { integer_part: "twelve" fractional_part: "five" } units: "kilograms" } -> twelve point five kilograms
         tokens { measure { units: "covid" decimal { integer_part: "nineteen"  fractional_part: "five" }  } } -> covid nineteen point five
-    
+
     Args:
         decimal: DecimalFst
         cardinal: CardinalFst

diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py b/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py
@@ -41,13 +41,6 @@ def __init__(self, deterministic: bool = True):
 
         optional_country_code = pynini.closure(country_code + delete_space + insert_space, 0, 1,)
 
-        prompt_part = (
-            pynutil.delete("prompt: \"")
-            + pynini.closure(NEMO_NOT_QUOTE, 1)
-            + pynini.closure(pynutil.add_weight(pynutil.delete(" "), -0.0001), 0, 1)
-            + pynutil.delete("\"")
-        ) + NEMO_SPACE
-
         number_part = (
             pynutil.delete("number_part: \"")
             + pynini.closure(NEMO_NOT_QUOTE, 1)
@@ -65,9 +58,6 @@ def __init__(self, deterministic: bool = True):
             1,
         )
 
-        graph = pynini.union(
-            prompt_part + optional_country_code + number_part + optional_extension,
-            optional_country_code + number_part + optional_extension,
-        )
+        graph = pynini.union(optional_country_code + number_part + optional_extension)
         delete_tokens = self.delete_tokens(graph)
         self.fst = delete_tokens.optimize()
diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/sv/verbalizers/verbalize_final.py
@@ -49,7 +49,6 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_
             self.fst = pynini.Far(far_file, mode="r")["verbalize"]
             logging.info(f'VerbalizeFinalFst graph was restored from {far_file}.')
         else:
-
             verbalize = VerbalizeFst(deterministic=deterministic).fst
             word = WordFst(deterministic=deterministic).fst
             types = verbalize | word
-Original file line number
+Diff line change
@@ Expand Up / @@ -22,7 +22,7 @@ def get_abs_path(rel_path): @@
         Args:
             rel_path: relative path to this file
         Returns absolute path
         """
         return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path
@@ Expand Down @@