Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ pipeline {

AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0'
DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0'
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-15-24-0'
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-06-24-0'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2'
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,4 @@
.ru dot RU
.de dot DE
.it dot IT
.jpg dot jpeg
.info dot info
.biz dot biz
.edu dot edu
.gov dot gov
.mil dot mil
.us dot us
.pl dot pl
.ca dot ca
.au dot au
.jpg dot jpeg
50 changes: 17 additions & 33 deletions nemo_text_processing/text_normalization/en/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
NEMO_NON_BREAKING_SPACE = "\u00A0"
NEMO_NON_BREAKING_SPACE = u"\u00A0"
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize()
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()

Expand Down Expand Up @@ -79,36 +79,20 @@
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
delete_preserve_order = pynini.closure(
pynutil.delete(" preserve_order: true")
| (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'))
| (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\""))
)

# Common string literals; expand as you see fit.
username_string = "username"
double_quotes = '"'
domain_string = "domain"
protocol_string = "protocol"
slash = "/"
double_slash = "//"
triple_slash = "///"
file = "file"
period = "."
at = "@"
colon = ":"
https = "https"
http = "http"
www = "www"

suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
_c = pynini.union(
"b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z",
"b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"
)
_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
_s = NEMO_SIGMA + pynutil.insert("s")

graph_plural = plurals._priority_union(
suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA,
suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA
).optimize()

SINGULAR_TO_PLURAL = graph_plural
Expand All @@ -123,8 +107,8 @@


def capitalized_input_graph(
graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None,
) -> "pynini.FstLike":
graph: 'pynini.FstLike', original_graph_weight: float = None, capitalized_graph_weight: float = None
) -> 'pynini.FstLike':
"""
Allow graph input to be capitalized, e.g. for ITN)

Expand All @@ -145,7 +129,7 @@ def capitalized_input_graph(
return graph


def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
"""
Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.

Expand All @@ -157,7 +141,7 @@ def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
for rule, graph in graphs.items():
exporter[rule] = graph.optimize()
exporter.close()
logger.info(f"Created {file_name}")
logger.info(f'Created {file_name}')


def get_plurals(fst):
Expand All @@ -184,7 +168,7 @@ def get_singulars(fst):
return PLURAL_TO_SINGULAR @ fst


def convert_space(fst) -> "pynini.FstLike":
def convert_space(fst) -> 'pynini.FstLike':
"""
Converts space to nonbreaking space.
Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
Expand All @@ -207,7 +191,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
written_capitalized = written[0].upper() + written[1:]
additional_labels.extend(
[
[written_capitalized, spoken.capitalize(),], # first letter capitalized
[written_capitalized, spoken.capitalize()], # first letter capitalized
[
written_capitalized,
spoken.upper().replace(" AND ", " and "),
Expand All @@ -221,7 +205,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
logger.debug(f"This is weight {weight}")
if len(weight) == 0:
additional_labels.extend(
[[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],]
[[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]]
)
else:
additional_labels.extend(
Expand Down Expand Up @@ -253,7 +237,7 @@ def __init__(self, name: str, kind: str, deterministic: bool = True):
self._fst = None
self.deterministic = deterministic

self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far')
if self.far_exist():
self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()

Expand All @@ -264,14 +248,14 @@ def far_exist(self) -> bool:
return self.far_path.exists()

@property
def fst(self) -> "pynini.FstLike":
def fst(self) -> 'pynini.FstLike':
return self._fst

@fst.setter
def fst(self, fst):
self._fst = fst

def add_tokens(self, fst) -> "pynini.FstLike":
def add_tokens(self, fst) -> 'pynini.FstLike':
"""
Wraps class name around to given fst

Expand All @@ -283,7 +267,7 @@ def add_tokens(self, fst) -> "pynini.FstLike":
"""
return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

def delete_tokens(self, fst) -> "pynini.FstLike":
def delete_tokens(self, fst) -> 'pynini.FstLike':
"""
Deletes class name wrap around output of given fst

Expand All @@ -302,4 +286,4 @@ def delete_tokens(self, fst) -> "pynini.FstLike":
+ delete_space
+ pynutil.delete("}")
)
return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA)
return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
78 changes: 24 additions & 54 deletions nemo_text_processing/text_normalization/en/taggers/electronic.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,31 +16,17 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import ( # common string literals
from nemo_text_processing.text_normalization.en.graph_utils import (
MIN_NEG_WEIGHT,
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_NOT_SPACE,
NEMO_SIGMA,
NEMO_SPACE,
NEMO_UPPER,
TO_UPPER,
GraphFst,
at,
colon,
domain_string,
double_quotes,
double_slash,
file,
get_abs_path,
http,
https,
period,
protocol_string,
slash,
triple_slash,
username_string,
www,
insert_space,
)


Expand All @@ -61,19 +47,19 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
if deterministic:
numbers = NEMO_DIGIT
else:
numbers = pynutil.insert(NEMO_SPACE) + cardinal.long_numbers + pynutil.insert(NEMO_SPACE)
numbers = pynutil.insert(" ") + cardinal.long_numbers + pynutil.insert(" ")

cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT,)
cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT)

accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input")
accepted_common_domains = pynini.project(
pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input"
)

dict_words = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/words.tsv")), MIN_NEG_WEIGHT,)
dict_words = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/words.tsv")), MIN_NEG_WEIGHT)

dict_words_without_delimiter = dict_words + pynini.closure(
pynutil.add_weight(pynutil.insert(NEMO_SPACE) + dict_words, MIN_NEG_WEIGHT), 1,
pynutil.add_weight(pynutil.insert(" ") + dict_words, MIN_NEG_WEIGHT), 1
)
dict_words_graph = dict_words_without_delimiter | dict_words

Expand All @@ -90,75 +76,59 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
NEMO_ALPHA | numbers | accepted_symbols | dict_words_graph
)

username = (
pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
+ username
+ pynutil.insert(double_quotes)
+ pynini.cross(at, NEMO_SPACE)
)
username = pynutil.insert("username: \"") + username + pynutil.insert("\"") + pynini.cross('@', ' ')

domain_graph = all_accepted_symbols_start + pynini.closure(
all_accepted_symbols_end | pynutil.add_weight(accepted_common_domains, MIN_NEG_WEIGHT)
)

protocol_symbols = pynini.closure((graph_symbols | pynini.cross(colon, "colon")) + pynutil.insert(NEMO_SPACE))
protocol_start = (
pynini.cross(https, (https.upper() + NEMO_SPACE)) | pynini.cross(http, (http.upper() + NEMO_SPACE))
) + (pynini.accep(colon + double_slash) @ protocol_symbols)
protocol_file_start = (
pynini.accep(file) + pynutil.insert(NEMO_SPACE) + (pynini.accep(colon + triple_slash) @ protocol_symbols)
protocol_symbols = pynini.closure((graph_symbols | pynini.cross(":", "colon")) + pynutil.insert(" "))
protocol_start = (pynini.cross("https", "HTTPS ") | pynini.cross("http", "HTTP ")) + (
pynini.accep("://") @ protocol_symbols
)
protocol_file_start = pynini.accep("file") + insert_space + (pynini.accep(":///") @ protocol_symbols)

protocol_end = pynutil.add_weight(
pynini.cross(www, (www.upper() + NEMO_SPACE)) + pynini.accep(period) @ protocol_symbols, -1000,
)
protocol_end = pynutil.add_weight(pynini.cross("www", "WWW ") + pynini.accep(".") @ protocol_symbols, -1000)
protocol = protocol_file_start | protocol_start | protocol_end | (protocol_start + protocol_end)

domain_graph_with_class_tags = (
pynutil.insert(domain_string + colon + NEMO_SPACE + double_quotes)
pynutil.insert("domain: \"")
+ pynini.compose(
NEMO_ALPHA + pynini.closure(NEMO_NOT_SPACE) + (NEMO_ALPHA | NEMO_DIGIT | pynini.accep(slash)),
NEMO_ALPHA + pynini.closure(NEMO_NOT_SPACE) + (NEMO_ALPHA | NEMO_DIGIT | pynini.accep("/")),
domain_graph,
).optimize()
+ pynutil.insert(double_quotes)
+ pynutil.insert("\"")
)

protocol = (
pynutil.insert(protocol_string + colon + NEMO_SPACE + double_quotes)
+ pynutil.add_weight(protocol, MIN_NEG_WEIGHT)
+ pynutil.insert(double_quotes)
)
protocol = pynutil.insert("protocol: \"") + pynutil.add_weight(protocol, MIN_NEG_WEIGHT) + pynutil.insert("\"")
# email
graph = pynini.compose(
NEMO_SIGMA + pynini.accep(at) + NEMO_SIGMA + pynini.accep(period) + NEMO_SIGMA,
NEMO_SIGMA + pynini.accep("@") + NEMO_SIGMA + pynini.accep(".") + NEMO_SIGMA,
username + domain_graph_with_class_tags,
)

# abc.com, abc.com/123-sm
# when only domain, make sure it starts and end with NEMO_ALPHA
graph |= (
pynutil.insert(domain_string + colon + NEMO_SPACE + double_quotes)
pynutil.insert("domain: \"")
+ pynini.compose(
NEMO_ALPHA
+ pynini.closure(NEMO_NOT_SPACE)
+ accepted_common_domains
+ pynini.closure(pynini.difference(NEMO_NOT_SPACE, pynini.accep(period))),
NEMO_ALPHA + pynini.closure(NEMO_NOT_SPACE) + accepted_common_domains + pynini.closure(NEMO_NOT_SPACE),
domain_graph,
).optimize()
+ pynutil.insert(double_quotes)
+ pynutil.insert("\"")
)
# www.abc.com/sdafsdf, or https://www.abc.com/asdfad or www.abc.abc/asdfad
graph |= protocol + pynutil.insert(NEMO_SPACE) + domain_graph_with_class_tags
graph |= protocol + pynutil.insert(" ") + domain_graph_with_class_tags

if deterministic:
# credit card cues
numbers = pynini.closure(NEMO_DIGIT, 4, 16)
cc_phrases = (
pynutil.insert(protocol_string + colon + NEMO_SPACE + double_quotes)
pynutil.insert("protocol: \"")
+ cc_cues
+ pynutil.insert(double_quotes + NEMO_SPACE + domain_string + colon + NEMO_SPACE + double_quotes)
+ pynutil.insert("\" domain: \"")
+ numbers
+ pynutil.insert(double_quotes)
+ pynutil.insert("\"")
)
graph |= cc_phrases

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ a@hotmail.fr~a at hotmail dot FR
a@hotmail.it~a at hotmail dot IT
a@aol.it~a at aol dot IT
a@msn.it~a at msn dot IT
cdf@abc.edu~cdf at abc dot edu
cdf@abc.edu~cdf at abc dot EDU
abc@gmail.abc~abc at gmail dot ABC
abc@abc.com~abc at abc dot com
asdf123@abc.com~asdf one two three at abc dot com
Expand Down Expand Up @@ -38,6 +38,4 @@ rtxprohelp@exchange.nvidia.com~RTX pro help at exchange dot NVIDIA dot com
enterpriseservices@nvidia.com~enterprise services at NVIDIA dot com
enterprise-services@nvidia.com~enterprise dash services at NVIDIA dot com
https://www.nvidia.com/dgx-basepod/~HTTPS colon slash slash WWW dot NVIDIA dot com slash DGX dash BASEPOD slash
i can use your card ending in 8876~i can use your card ending in eight eight seven six
here is mail.nasa.gov.~here is mail dot nasa dot gov.
check us out at some_university.edu.~check us out at some underscore university dot edu.
i can use your card ending in 8876~i can use your card ending in eight eight seven six