Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pipeline {
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-09-23-0'
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-09-23-0'
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def __init__(self, deterministic: bool = True):
ties |= pynutil.add_weight(alt_ties, -0.001)
ties |= pynutil.add_weight(pynini.cross("4", "förtio"), -0.001)
ties |= pynutil.add_weight(pynini.cross("4", "förti"), -0.001)
ties |= pynutil.add_weight(pynini.cross("2", "tju"), -0.001)

# Any double digit
graph_tens = teen
Expand Down Expand Up @@ -328,6 +329,8 @@ def __init__(self, deterministic: bool = True):
digit + insert_space + ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ graph_tens, -0.001
)
self.two_digits_read |= pynutil.add_weight(digit + insert_space + digit, -0.001)
self.any_read_digit |= self.two_digits_read
self.any_read_digit |= self.three_digits_read

self.graph |= zero

Expand Down
4 changes: 2 additions & 2 deletions nemo_text_processing/text_normalization/sv/taggers/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def get_quantity(
e.g. 1 miljon -> integer_part: "en" quantity: "miljon"
e.g. 1,5 miljoner -> integer_part: "en" fractional_part: "fem" quantity: "miljoner"

Args:
Args:
decimal: decimal FST
cardinal_up_to_hundred: cardinal FST
"""
Expand Down Expand Up @@ -112,7 +112,7 @@ def get_quantity(

class DecimalFst(GraphFst):
"""
Finite state transducer for classifying decimal, e.g.
Finite state transducer for classifying decimal, e.g.
-12,5006 biljon -> decimal { negative: "true" integer_part: "tolv" fractional_part: "fem noll noll sex" quantity: "biljon" }
1 biljon -> decimal { integer_part: "en" quantity: "biljon" }

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

class MeasureFst(GraphFst):
"""
Finite state transducer for classifying measure, suppletive aware, e.g.
Finite state transducer for classifying measure, suppletive aware, e.g.
-12kg -> measure { negative: "true" cardinal { integer: "tolv" } units: "kilogram" }
1kg -> measure { cardinal { integer: "ett" } units: "kilogram" }
,5kg -> measure { decimal { fractional_part: "fem" } units: "kilogram" }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

class MoneyFst(GraphFst):
"""
Finite state transducer for classifying money, suppletive aware, e.g.
Finite state transducer for classifying money, suppletive aware, e.g.
$12,05 -> money { integer_part: "tolv" currency_maj: "dollar" fractional_part: "fem" currency_min: "cent" preserve_order: true }
$12,0500 -> money { integer_part: "tolv" currency_maj: "dollar" fractional_part: "fem" currency_min: "cent" preserve_order: true }
$1 -> money { currency_maj: "dollar" integer_part: "en" }
Expand Down
48 changes: 36 additions & 12 deletions nemo_text_processing/text_normalization/sv/taggers/telephone.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_DIGIT,
NEMO_SPACE,
GraphFst,
delete_extra_space,
Expand Down Expand Up @@ -42,7 +43,7 @@ class TelephoneFst(GraphFst):
0XX-XXX XX
0XXX-XX XX XX
0XXX-XXX XX

See:
https://en.wikipedia.org/wiki/National_conventions_for_writing_telephone_numbers#Sweden
https://codegolf.stackexchange.com/questions/195787/format-a-swedish-phone-number
Expand Down Expand Up @@ -70,14 +71,23 @@ def __init__(self, deterministic: bool = True):

special_numbers = pynini.string_file(get_abs_path("data/telephone/special_numbers.tsv"))

passable = pynini.union(":", ": ", " ")
prompt_pass = pynini.closure(pynutil.delete(passable) + insert_space, 0, 1)
telephone_abbr = pynini.string_file(get_abs_path("data/telephone/telephone_abbr.tsv"))
telephone_abbr = telephone_abbr + prompt_pass
telephone_prompt = pynini.string_file(get_abs_path("data/telephone/telephone_prompt.tsv"))
prompt = pynutil.insert("prompt: \"") + telephone_prompt + pynutil.insert("\"")
prompt |= pynutil.insert("prompt: \"") + telephone_abbr + pynutil.insert("\"")
prompt |= pynutil.insert("prompt: \"") + telephone_prompt + NEMO_SPACE + telephone_abbr + pynutil.insert("\"")
prompt_as_code = pynutil.insert("country_code: \"") + telephone_prompt + pynutil.insert("\"")
prompt_as_code |= pynutil.insert("country_code: \"") + telephone_abbr + pynutil.insert("\"")
prompt_as_code |= (
pynutil.insert("country_code: \"") + telephone_prompt + NEMO_SPACE + telephone_abbr + pynutil.insert("\"")
)
prompt_inner = telephone_prompt | telephone_abbr
prompt_inner |= telephone_prompt + NEMO_SPACE + telephone_abbr

country_code = pynini.closure(pynini.cross("+", "plus "), 0, 1) + one_two_or_three_digits
country_code = pynutil.insert("country_code: \"") + country_code + pynutil.insert("\"")
country = pynini.closure(pynini.cross("+", "plus "), 0, 1) + one_two_or_three_digits
country_code = pynutil.insert("country_code: \"") + country + pynutil.insert("\"")
country_code |= prompt_as_code
country_code |= pynutil.insert("country_code: \"") + prompt_inner + NEMO_SPACE + country + pynutil.insert("\"")

opt_dash = pynini.closure(pynutil.delete("-"), 0, 1)
area_part = zero_after_country_code + one_two_or_three_digits + opt_dash + add_separator
Expand All @@ -102,19 +112,33 @@ def __init__(self, deterministic: bool = True):
prompt_pass = pynutil.delete(passable) + insert_space

special_numbers = pynutil.insert("number_part: \"") + special_numbers + pynutil.insert("\"")
prompt = prompt + prompt_pass
graph = pynini.union(
country_code + ensure_space + number_part,
country_code + ensure_space + number_part + ext_prompt + extension,
number_part + ext_prompt + extension,
prompt + number_part,
prompt + special_numbers,
prompt + country_code + number_part,
prompt + country_code + number_part + ext_prompt + extension,
prompt + number_part + ext_prompt + extension,
country_code + number_part,
country_code + special_numbers,
country_code + number_part + ext_prompt + extension,
)
self.tel_graph = graph.optimize()

# No need to be so exact here, but better for ITN to have it
three_digit_area_code_digit_two = pynini.union("1", "2", "3", "4", "7")
three_digit_area_code_no_zero = (three_digit_area_code_digit_two + NEMO_DIGIT) @ cardinal.two_digits_read
three_digit_area_code = zero_space + three_digit_area_code_no_zero
four_digit_area_code_digit_two = pynini.union("5", "6", "9")
four_digit_area_code_no_zero = (four_digit_area_code_digit_two + NEMO_DIGIT) @ cardinal.three_digits_read
four_digit_area_code = zero_space + four_digit_area_code_no_zero
two_digit_area_code = "08" @ cardinal.two_digits_read
self.area_codes = two_digit_area_code | three_digit_area_code | four_digit_area_code
self.area_codes_no_zero = (
three_digit_area_code_no_zero | four_digit_area_code_no_zero | pynini.cross("8", "åtta")
)
country_code_lead = pynini.cross("+", "plus") | pynini.cross("00", "noll noll")
raw_country_codes = pynini.string_file(get_abs_path("data/telephone/country_codes.tsv"))
self.country_code = country_code_lead + insert_space + (raw_country_codes @ cardinal.any_read_digit)
self.country_plus_area_code = self.country_code + NEMO_SPACE + self.area_codes_no_zero

# ip
ip_prompts = pynini.string_file(get_abs_path("data/telephone/ip_prompt.tsv"))
ip_graph = one_two_or_three_digits + (pynini.cross(".", " punkt ") + one_two_or_three_digits) ** 3
Expand Down
2 changes: 1 addition & 1 deletion nemo_text_processing/text_normalization/sv/taggers/time.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class TimeFst(GraphFst):
02:00 -> time { hours: "två" }
2:00 -> time { hours: "två" }
10:00:05 e.m. -> time { hours: "tio" minutes: "noll" seconds: "fem" suffix: "eftermiddag" }

Args:
cardinal: CardinalFst
deterministic: if True will provide a single transduction option,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@
class ClassifyFst(GraphFst):
"""
Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
More details on deployment at NeMo/tools/text_processing_deployment.

Args:
input_case: accepting either "lower_cased" or "cased" input.
deterministic: if True will provide a single transduction option,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@
class ClassifyFst(GraphFst):
"""
Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
More details on deployment at NeMo/tools/text_processing_deployment.

Args:
input_case: accepting either "lower_cased" or "cased" input.
deterministic: if True will provide a single transduction option,
Expand Down
2 changes: 1 addition & 1 deletion nemo_text_processing/text_normalization/sv/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def get_abs_path(rel_path):

Args:
rel_path: relative path to this file

Returns absolute path
"""
return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class MeasureFst(GraphFst):
measure { negative: "true" cardinal { integer: "twelve" } units: "kilograms" } -> minus twelve kilograms
measure { decimal { integer_part: "twelve" fractional_part: "five" } units: "kilograms" } -> twelve point five kilograms
tokens { measure { units: "covid" decimal { integer_part: "nineteen" fractional_part: "five" } } } -> covid nineteen point five

Args:
decimal: DecimalFst
cardinal: CardinalFst
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,6 @@ def __init__(self, deterministic: bool = True):

optional_country_code = pynini.closure(country_code + delete_space + insert_space, 0, 1,)

prompt_part = (
pynutil.delete("prompt: \"")
+ pynini.closure(NEMO_NOT_QUOTE, 1)
+ pynini.closure(pynutil.add_weight(pynutil.delete(" "), -0.0001), 0, 1)
+ pynutil.delete("\"")
) + NEMO_SPACE

number_part = (
pynutil.delete("number_part: \"")
+ pynini.closure(NEMO_NOT_QUOTE, 1)
Expand All @@ -65,9 +58,6 @@ def __init__(self, deterministic: bool = True):
1,
)

graph = pynini.union(
prompt_part + optional_country_code + number_part + optional_extension,
optional_country_code + number_part + optional_extension,
)
graph = pynini.union(optional_country_code + number_part + optional_extension)
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_
self.fst = pynini.Far(far_file, mode="r")["verbalize"]
logging.info(f'VerbalizeFinalFst graph was restored from {far_file}.')
else:

verbalize = VerbalizeFst(deterministic=deterministic).fst
word = WordFst(deterministic=deterministic).fst
types = verbalize | word
Expand Down