NVIDIA · zoobereq · Jul 22, 2024
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -13,7 +13,7 @@ pipeline {
 
     AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0'
     DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0'
-    EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-15-24-0'
+    EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-06-24-0'
     ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0'
     ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2'
     FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'

diff --git a/nemo_text_processing/text_normalization/en/data/electronic/domain.tsv b/nemo_text_processing/text_normalization/en/data/electronic/domain.tsv
@@ -9,13 +9,4 @@
 .ru	dot RU
 .de	dot DE
 .it	dot IT
-.jpg	dot jpeg
-.info	dot info
-.biz	dot biz
-.edu	dot edu
-.gov	dot gov
-.mil	dot mil
-.us	dot us
-.pl	dot pl
-.ca	dot ca
-.au	dot au
+.jpg	dot jpeg
diff --git a/nemo_text_processing/text_normalization/en/graph_utils.py b/nemo_text_processing/text_normalization/en/graph_utils.py
@@ -35,9 +35,9 @@
 NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
 NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
 NEMO_HEX = pynini.union(*string.hexdigits).optimize()
-NEMO_NON_BREAKING_SPACE = "\u00A0"
+NEMO_NON_BREAKING_SPACE = u"\u00A0"
 NEMO_SPACE = " "
-NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize()
+NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
 NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
 NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
 
@@ -79,36 +79,20 @@
 delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
 delete_preserve_order = pynini.closure(
     pynutil.delete(" preserve_order: true")
-    | (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'))
+    | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\""))
 )
 
-# Common string literals; expand as you see fit.
-username_string = "username"
-double_quotes = '"'
-domain_string = "domain"
-protocol_string = "protocol"
-slash = "/"
-double_slash = "//"
-triple_slash = "///"
-file = "file"
-period = "."
-at = "@"
-colon = ":"
-https = "https"
-http = "http"
-www = "www"
-
 suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
 # _v = pynini.union("a", "e", "i", "o", "u")
 _c = pynini.union(
-    "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z",
+    "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"
 )
 _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
 _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
 _s = NEMO_SIGMA + pynutil.insert("s")
 
 graph_plural = plurals._priority_union(
-    suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA,
+    suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA
 ).optimize()
 
 SINGULAR_TO_PLURAL = graph_plural
@@ -123,8 +107,8 @@
 
 
 def capitalized_input_graph(
-    graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None,
-) -> "pynini.FstLike":
+    graph: 'pynini.FstLike', original_graph_weight: float = None, capitalized_graph_weight: float = None
+) -> 'pynini.FstLike':
     """
     Allow graph input to be capitalized, e.g. for ITN)
 
@@ -145,7 +129,7 @@ def capitalized_input_graph(
     return graph
 
 
-def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
+def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
     """
     Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.
 
@@ -157,7 +141,7 @@ def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
     for rule, graph in graphs.items():
         exporter[rule] = graph.optimize()
     exporter.close()
-    logger.info(f"Created {file_name}")
+    logger.info(f'Created {file_name}')
 
 
 def get_plurals(fst):
@@ -184,7 +168,7 @@ def get_singulars(fst):
     return PLURAL_TO_SINGULAR @ fst
 
 
-def convert_space(fst) -> "pynini.FstLike":
+def convert_space(fst) -> 'pynini.FstLike':
     """
     Converts space to nonbreaking space.
     Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
@@ -207,7 +191,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
             written_capitalized = written[0].upper() + written[1:]
             additional_labels.extend(
                 [
-                    [written_capitalized, spoken.capitalize(),],  # first letter capitalized
+                    [written_capitalized, spoken.capitalize()],  # first letter capitalized
                     [
                         written_capitalized,
                         spoken.upper().replace(" AND ", " and "),
@@ -221,7 +205,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
                 logger.debug(f"This is weight {weight}")
                 if len(weight) == 0:
                     additional_labels.extend(
-                        [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],]
+                        [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]]
                     )
                 else:
                     additional_labels.extend(
@@ -253,7 +237,7 @@ def __init__(self, name: str, kind: str, deterministic: bool = True):
         self._fst = None
         self.deterministic = deterministic
 
-        self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
+        self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far')
         if self.far_exist():
             self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()
 
@@ -264,14 +248,14 @@ def far_exist(self) -> bool:
         return self.far_path.exists()
 
     @property
-    def fst(self) -> "pynini.FstLike":
+    def fst(self) -> 'pynini.FstLike':
         return self._fst
 
     @fst.setter
     def fst(self, fst):
         self._fst = fst
 
-    def add_tokens(self, fst) -> "pynini.FstLike":
+    def add_tokens(self, fst) -> 'pynini.FstLike':
         """
         Wraps class name around to given fst
 
@@ -283,7 +267,7 @@ def add_tokens(self, fst) -> "pynini.FstLike":
         """
         return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")
 
-    def delete_tokens(self, fst) -> "pynini.FstLike":
+    def delete_tokens(self, fst) -> 'pynini.FstLike':
         """
         Deletes class name wrap around output of given fst
 
@@ -302,4 +286,4 @@ def delete_tokens(self, fst) -> "pynini.FstLike":
             + delete_space
             + pynutil.delete("}")
         )
-        return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA)
+        return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
diff --git a/nemo_text_processing/text_normalization/en/taggers/electronic.py b/nemo_text_processing/text_normalization/en/taggers/electronic.py
@@ -16,31 +16,17 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.en.graph_utils import (  # common string literals
+from nemo_text_processing.text_normalization.en.graph_utils import (
     MIN_NEG_WEIGHT,
     NEMO_ALPHA,
     NEMO_DIGIT,
     NEMO_NOT_SPACE,
     NEMO_SIGMA,
-    NEMO_SPACE,
     NEMO_UPPER,
     TO_UPPER,
     GraphFst,
-    at,
-    colon,
-    domain_string,
-    double_quotes,
-    double_slash,
-    file,
     get_abs_path,
-    http,
-    https,
-    period,
-    protocol_string,
-    slash,
-    triple_slash,
-    username_string,
-    www,
+    insert_space,
 )
 
 
@@ -61,19 +47,19 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
         if deterministic:
             numbers = NEMO_DIGIT
         else:
-            numbers = pynutil.insert(NEMO_SPACE) + cardinal.long_numbers + pynutil.insert(NEMO_SPACE)
+            numbers = pynutil.insert(" ") + cardinal.long_numbers + pynutil.insert(" ")
 
-        cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT,)
+        cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT)
 
         accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input")
         accepted_common_domains = pynini.project(
             pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input"
         )
 
-        dict_words = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/words.tsv")), MIN_NEG_WEIGHT,)
+        dict_words = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/words.tsv")), MIN_NEG_WEIGHT)
 
         dict_words_without_delimiter = dict_words + pynini.closure(
-            pynutil.add_weight(pynutil.insert(NEMO_SPACE) + dict_words, MIN_NEG_WEIGHT), 1,
+            pynutil.add_weight(pynutil.insert(" ") + dict_words, MIN_NEG_WEIGHT), 1
         )
         dict_words_graph = dict_words_without_delimiter | dict_words
 
@@ -90,75 +76,59 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
             NEMO_ALPHA | numbers | accepted_symbols | dict_words_graph
         )
 
-        username = (
-            pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
-            + username
-            + pynutil.insert(double_quotes)
-            + pynini.cross(at, NEMO_SPACE)
-        )
+        username = pynutil.insert("username: \"") + username + pynutil.insert("\"") + pynini.cross('@', ' ')
 
         domain_graph = all_accepted_symbols_start + pynini.closure(
             all_accepted_symbols_end | pynutil.add_weight(accepted_common_domains, MIN_NEG_WEIGHT)
         )
 
-        protocol_symbols = pynini.closure((graph_symbols | pynini.cross(colon, "colon")) + pynutil.insert(NEMO_SPACE))
-        protocol_start = (
-            pynini.cross(https, (https.upper() + NEMO_SPACE)) | pynini.cross(http, (http.upper() + NEMO_SPACE))
-        ) + (pynini.accep(colon + double_slash) @ protocol_symbols)
-        protocol_file_start = (
-            pynini.accep(file) + pynutil.insert(NEMO_SPACE) + (pynini.accep(colon + triple_slash) @ protocol_symbols)
+        protocol_symbols = pynini.closure((graph_symbols | pynini.cross(":", "colon")) + pynutil.insert(" "))
+        protocol_start = (pynini.cross("https", "HTTPS ") | pynini.cross("http", "HTTP ")) + (
+            pynini.accep("://") @ protocol_symbols
         )
+        protocol_file_start = pynini.accep("file") + insert_space + (pynini.accep(":///") @ protocol_symbols)
 
-        protocol_end = pynutil.add_weight(
-            pynini.cross(www, (www.upper() + NEMO_SPACE)) + pynini.accep(period) @ protocol_symbols, -1000,
-        )
+        protocol_end = pynutil.add_weight(pynini.cross("www", "WWW ") + pynini.accep(".") @ protocol_symbols, -1000)
         protocol = protocol_file_start | protocol_start | protocol_end | (protocol_start + protocol_end)
 
         domain_graph_with_class_tags = (
-            pynutil.insert(domain_string + colon + NEMO_SPACE + double_quotes)
+            pynutil.insert("domain: \"")
             + pynini.compose(
-                NEMO_ALPHA + pynini.closure(NEMO_NOT_SPACE) + (NEMO_ALPHA | NEMO_DIGIT | pynini.accep(slash)),
+                NEMO_ALPHA + pynini.closure(NEMO_NOT_SPACE) + (NEMO_ALPHA | NEMO_DIGIT | pynini.accep("/")),
                 domain_graph,
             ).optimize()
-            + pynutil.insert(double_quotes)
+            + pynutil.insert("\"")
         )
 
-        protocol = (
-            pynutil.insert(protocol_string + colon + NEMO_SPACE + double_quotes)
-            + pynutil.add_weight(protocol, MIN_NEG_WEIGHT)
-            + pynutil.insert(double_quotes)
-        )
+        protocol = pynutil.insert("protocol: \"") + pynutil.add_weight(protocol, MIN_NEG_WEIGHT) + pynutil.insert("\"")
         # email
         graph = pynini.compose(
-            NEMO_SIGMA + pynini.accep(at) + NEMO_SIGMA + pynini.accep(period) + NEMO_SIGMA,
+            NEMO_SIGMA + pynini.accep("@") + NEMO_SIGMA + pynini.accep(".") + NEMO_SIGMA,
             username + domain_graph_with_class_tags,
         )
 
         # abc.com, abc.com/123-sm
         # when only domain, make sure it starts and end with NEMO_ALPHA
         graph |= (
-            pynutil.insert(domain_string + colon + NEMO_SPACE + double_quotes)
+            pynutil.insert("domain: \"")
             + pynini.compose(
-                NEMO_ALPHA
-                + pynini.closure(NEMO_NOT_SPACE)
-                + accepted_common_domains
-                + pynini.closure(pynini.difference(NEMO_NOT_SPACE, pynini.accep(period))),
+                NEMO_ALPHA + pynini.closure(NEMO_NOT_SPACE) + accepted_common_domains + pynini.closure(NEMO_NOT_SPACE),
                 domain_graph,
             ).optimize()
-            + pynutil.insert(double_quotes)
+            + pynutil.insert("\"")
         )
         # www.abc.com/sdafsdf, or https://www.abc.com/asdfad or www.abc.abc/asdfad
-        graph |= protocol + pynutil.insert(NEMO_SPACE) + domain_graph_with_class_tags
+        graph |= protocol + pynutil.insert(" ") + domain_graph_with_class_tags
 
         if deterministic:
             # credit card cues
             numbers = pynini.closure(NEMO_DIGIT, 4, 16)
             cc_phrases = (
-                pynutil.insert(protocol_string + colon + NEMO_SPACE + double_quotes)
+                pynutil.insert("protocol: \"")
                 + cc_cues
-                + pynutil.insert(double_quotes + NEMO_SPACE + domain_string + colon + NEMO_SPACE + double_quotes)
+                + pynutil.insert("\" domain: \"")
                 + numbers
-                + pynutil.insert(double_quotes)
+                + pynutil.insert("\"")
             )
             graph |= cc_phrases
 

diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt
@@ -4,7 +4,7 @@ a@hotmail.fr~a at hotmail dot FR
 a@hotmail.it~a at hotmail dot IT
 a@aol.it~a at aol dot IT
 a@msn.it~a at msn dot IT
-cdf@abc.edu~cdf at abc dot edu
+cdf@abc.edu~cdf at abc dot EDU
 abc@gmail.abc~abc at gmail dot ABC
 abc@abc.com~abc at abc dot com
 asdf123@abc.com~asdf one two three at abc dot com
@@ -38,6 +38,4 @@ rtxprohelp@exchange.nvidia.com~RTX pro help at exchange dot NVIDIA dot com
 enterpriseservices@nvidia.com~enterprise services at NVIDIA dot com
 enterprise-services@nvidia.com~enterprise dash services at NVIDIA dot com
 https://www.nvidia.com/dgx-basepod/~HTTPS colon slash slash WWW dot NVIDIA dot com slash DGX dash BASEPOD slash
-i can use your card ending in 8876~i can use your card ending in eight eight seven six
-here is mail.nasa.gov.~here is mail dot nasa dot gov.
-check us out at some_university.edu.~check us out at some underscore university dot edu.
+i can use your card ending in 8876~i can use your card ending in eight eight seven six