diff --git a/Jenkinsfile b/Jenkinsfile index d11d51d21..e1c589a23 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -13,7 +13,7 @@ pipeline { AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0' DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0' - EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-15-24-0' + EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-06-24-0' ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0' ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2' FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0' diff --git a/nemo_text_processing/text_normalization/en/data/electronic/domain.tsv b/nemo_text_processing/text_normalization/en/data/electronic/domain.tsv index 5878a8c77..0e7042c5f 100644 --- a/nemo_text_processing/text_normalization/en/data/electronic/domain.tsv +++ b/nemo_text_processing/text_normalization/en/data/electronic/domain.tsv @@ -9,13 +9,4 @@ .ru dot RU .de dot DE .it dot IT -.jpg dot jpeg -.info dot info -.biz dot biz -.edu dot edu -.gov dot gov -.mil dot mil -.us dot us -.pl dot pl -.ca dot ca -.au dot au \ No newline at end of file +.jpg dot jpeg \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/en/graph_utils.py b/nemo_text_processing/text_normalization/en/graph_utils.py index 3d319163e..239e1b282 100644 --- a/nemo_text_processing/text_normalization/en/graph_utils.py +++ b/nemo_text_processing/text_normalization/en/graph_utils.py @@ -35,9 +35,9 @@ NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() NEMO_HEX = pynini.union(*string.hexdigits).optimize() -NEMO_NON_BREAKING_SPACE = "\u00A0" +NEMO_NON_BREAKING_SPACE = u"\u00A0" NEMO_SPACE = " " -NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize() +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() @@ -79,36 +79,20 @@ delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") delete_preserve_order = pynini.closure( pynutil.delete(" preserve_order: true") - | (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"')) + | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"")) ) -# Common string literals; expand as you see fit. -username_string = "username" -double_quotes = '"' -domain_string = "domain" -protocol_string = "protocol" -slash = "/" -double_slash = "//" -triple_slash = "///" -file = "file" -period = "." -at = "@" -colon = ":" -https = "https" -http = "http" -www = "www" - suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) # _v = pynini.union("a", "e", "i", "o", "u") _c = pynini.union( - "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z", + "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z" ) _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") _s = NEMO_SIGMA + pynutil.insert("s") graph_plural = plurals._priority_union( - suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA, + suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA ).optimize() SINGULAR_TO_PLURAL = graph_plural @@ -123,8 +107,8 @@ def capitalized_input_graph( - graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None, -) -> "pynini.FstLike": + graph: 'pynini.FstLike', original_graph_weight: float = None, capitalized_graph_weight: float = None +) -> 'pynini.FstLike': """ Allow graph input to be capitalized, e.g. for ITN) @@ -145,7 +129,7 @@ def capitalized_input_graph( return graph -def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]): +def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): """ Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. @@ -157,7 +141,7 @@ def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]): for rule, graph in graphs.items(): exporter[rule] = graph.optimize() exporter.close() - logger.info(f"Created {file_name}") + logger.info(f'Created {file_name}') def get_plurals(fst): @@ -184,7 +168,7 @@ def get_singulars(fst): return PLURAL_TO_SINGULAR @ fst -def convert_space(fst) -> "pynini.FstLike": +def convert_space(fst) -> 'pynini.FstLike': """ Converts space to nonbreaking space. Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" @@ -207,7 +191,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): written_capitalized = written[0].upper() + written[1:] additional_labels.extend( [ - [written_capitalized, spoken.capitalize(),], # first letter capitalized + [written_capitalized, spoken.capitalize()], # first letter capitalized [ written_capitalized, spoken.upper().replace(" AND ", " and "), @@ -221,7 +205,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): logger.debug(f"This is weight {weight}") if len(weight) == 0: additional_labels.extend( - [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],] + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]] ) else: additional_labels.extend( @@ -253,7 +237,7 @@ def __init__(self, name: str, kind: str, deterministic: bool = True): self._fst = None self.deterministic = deterministic - self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far") + self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far') if self.far_exist(): self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() @@ -264,14 +248,14 @@ def far_exist(self) -> bool: return self.far_path.exists() @property - def fst(self) -> "pynini.FstLike": + def fst(self) -> 'pynini.FstLike': return self._fst @fst.setter def fst(self, fst): self._fst = fst - def add_tokens(self, fst) -> "pynini.FstLike": + def add_tokens(self, fst) -> 'pynini.FstLike': """ Wraps class name around to given fst @@ -283,7 +267,7 @@ def add_tokens(self, fst) -> "pynini.FstLike": """ return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") - def delete_tokens(self, fst) -> "pynini.FstLike": + def delete_tokens(self, fst) -> 'pynini.FstLike': """ Deletes class name wrap around output of given fst @@ -302,4 +286,4 @@ def delete_tokens(self, fst) -> "pynini.FstLike": + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/en/taggers/electronic.py b/nemo_text_processing/text_normalization/en/taggers/electronic.py index c3d0a1003..4d6f0e6ce 100644 --- a/nemo_text_processing/text_normalization/en/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/en/taggers/electronic.py @@ -16,31 +16,17 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import ( # common string literals +from nemo_text_processing.text_normalization.en.graph_utils import ( MIN_NEG_WEIGHT, NEMO_ALPHA, NEMO_DIGIT, NEMO_NOT_SPACE, NEMO_SIGMA, - NEMO_SPACE, NEMO_UPPER, TO_UPPER, GraphFst, - at, - colon, - domain_string, - double_quotes, - double_slash, - file, get_abs_path, - http, - https, - period, - protocol_string, - slash, - triple_slash, - username_string, - www, + insert_space, ) @@ -61,19 +47,19 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): if deterministic: numbers = NEMO_DIGIT else: - numbers = pynutil.insert(NEMO_SPACE) + cardinal.long_numbers + pynutil.insert(NEMO_SPACE) + numbers = pynutil.insert(" ") + cardinal.long_numbers + pynutil.insert(" ") - cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT,) + cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT) accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input") accepted_common_domains = pynini.project( pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input" ) - dict_words = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/words.tsv")), MIN_NEG_WEIGHT,) + dict_words = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/words.tsv")), MIN_NEG_WEIGHT) dict_words_without_delimiter = dict_words + pynini.closure( - pynutil.add_weight(pynutil.insert(NEMO_SPACE) + dict_words, MIN_NEG_WEIGHT), 1, + pynutil.add_weight(pynutil.insert(" ") + dict_words, MIN_NEG_WEIGHT), 1 ) dict_words_graph = dict_words_without_delimiter | dict_words @@ -90,75 +76,59 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): NEMO_ALPHA | numbers | accepted_symbols | dict_words_graph ) - username = ( - pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes) - + username - + pynutil.insert(double_quotes) - + pynini.cross(at, NEMO_SPACE) - ) + username = pynutil.insert("username: \"") + username + pynutil.insert("\"") + pynini.cross('@', ' ') domain_graph = all_accepted_symbols_start + pynini.closure( all_accepted_symbols_end | pynutil.add_weight(accepted_common_domains, MIN_NEG_WEIGHT) ) - protocol_symbols = pynini.closure((graph_symbols | pynini.cross(colon, "colon")) + pynutil.insert(NEMO_SPACE)) - protocol_start = ( - pynini.cross(https, (https.upper() + NEMO_SPACE)) | pynini.cross(http, (http.upper() + NEMO_SPACE)) - ) + (pynini.accep(colon + double_slash) @ protocol_symbols) - protocol_file_start = ( - pynini.accep(file) + pynutil.insert(NEMO_SPACE) + (pynini.accep(colon + triple_slash) @ protocol_symbols) + protocol_symbols = pynini.closure((graph_symbols | pynini.cross(":", "colon")) + pynutil.insert(" ")) + protocol_start = (pynini.cross("https", "HTTPS ") | pynini.cross("http", "HTTP ")) + ( + pynini.accep("://") @ protocol_symbols ) + protocol_file_start = pynini.accep("file") + insert_space + (pynini.accep(":///") @ protocol_symbols) - protocol_end = pynutil.add_weight( - pynini.cross(www, (www.upper() + NEMO_SPACE)) + pynini.accep(period) @ protocol_symbols, -1000, - ) + protocol_end = pynutil.add_weight(pynini.cross("www", "WWW ") + pynini.accep(".") @ protocol_symbols, -1000) protocol = protocol_file_start | protocol_start | protocol_end | (protocol_start + protocol_end) domain_graph_with_class_tags = ( - pynutil.insert(domain_string + colon + NEMO_SPACE + double_quotes) + pynutil.insert("domain: \"") + pynini.compose( - NEMO_ALPHA + pynini.closure(NEMO_NOT_SPACE) + (NEMO_ALPHA | NEMO_DIGIT | pynini.accep(slash)), + NEMO_ALPHA + pynini.closure(NEMO_NOT_SPACE) + (NEMO_ALPHA | NEMO_DIGIT | pynini.accep("/")), domain_graph, ).optimize() - + pynutil.insert(double_quotes) + + pynutil.insert("\"") ) - protocol = ( - pynutil.insert(protocol_string + colon + NEMO_SPACE + double_quotes) - + pynutil.add_weight(protocol, MIN_NEG_WEIGHT) - + pynutil.insert(double_quotes) - ) + protocol = pynutil.insert("protocol: \"") + pynutil.add_weight(protocol, MIN_NEG_WEIGHT) + pynutil.insert("\"") # email graph = pynini.compose( - NEMO_SIGMA + pynini.accep(at) + NEMO_SIGMA + pynini.accep(period) + NEMO_SIGMA, + NEMO_SIGMA + pynini.accep("@") + NEMO_SIGMA + pynini.accep(".") + NEMO_SIGMA, username + domain_graph_with_class_tags, ) # abc.com, abc.com/123-sm # when only domain, make sure it starts and end with NEMO_ALPHA graph |= ( - pynutil.insert(domain_string + colon + NEMO_SPACE + double_quotes) + pynutil.insert("domain: \"") + pynini.compose( - NEMO_ALPHA - + pynini.closure(NEMO_NOT_SPACE) - + accepted_common_domains - + pynini.closure(pynini.difference(NEMO_NOT_SPACE, pynini.accep(period))), + NEMO_ALPHA + pynini.closure(NEMO_NOT_SPACE) + accepted_common_domains + pynini.closure(NEMO_NOT_SPACE), domain_graph, ).optimize() - + pynutil.insert(double_quotes) + + pynutil.insert("\"") ) # www.abc.com/sdafsdf, or https://www.abc.com/asdfad or www.abc.abc/asdfad - graph |= protocol + pynutil.insert(NEMO_SPACE) + domain_graph_with_class_tags + graph |= protocol + pynutil.insert(" ") + domain_graph_with_class_tags if deterministic: # credit card cues numbers = pynini.closure(NEMO_DIGIT, 4, 16) cc_phrases = ( - pynutil.insert(protocol_string + colon + NEMO_SPACE + double_quotes) + pynutil.insert("protocol: \"") + cc_cues - + pynutil.insert(double_quotes + NEMO_SPACE + domain_string + colon + NEMO_SPACE + double_quotes) + + pynutil.insert("\" domain: \"") + numbers - + pynutil.insert(double_quotes) + + pynutil.insert("\"") ) graph |= cc_phrases diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt index 05831ad1c..9892a9fe5 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt @@ -4,7 +4,7 @@ a@hotmail.fr~a at hotmail dot FR a@hotmail.it~a at hotmail dot IT a@aol.it~a at aol dot IT a@msn.it~a at msn dot IT -cdf@abc.edu~cdf at abc dot edu +cdf@abc.edu~cdf at abc dot EDU abc@gmail.abc~abc at gmail dot ABC abc@abc.com~abc at abc dot com asdf123@abc.com~asdf one two three at abc dot com @@ -38,6 +38,4 @@ rtxprohelp@exchange.nvidia.com~RTX pro help at exchange dot NVIDIA dot com enterpriseservices@nvidia.com~enterprise services at NVIDIA dot com enterprise-services@nvidia.com~enterprise dash services at NVIDIA dot com https://www.nvidia.com/dgx-basepod/~HTTPS colon slash slash WWW dot NVIDIA dot com slash DGX dash BASEPOD slash -i can use your card ending in 8876~i can use your card ending in eight eight seven six -here is mail.nasa.gov.~here is mail dot nasa dot gov. -check us out at some_university.edu.~check us out at some underscore university dot edu. \ No newline at end of file +i can use your card ending in 8876~i can use your card ending in eight eight seven six \ No newline at end of file