diff --git a/docs/source/starthere/tutorials.rst b/docs/source/starthere/tutorials.rst index eb3cab1ee6bc..a526b28130e5 100644 --- a/docs/source/starthere/tutorials.rst +++ b/docs/source/starthere/tutorials.rst @@ -151,6 +151,9 @@ To run a tutorial: * - Text Processing - Inverse Text Normalization for ASR - `Inverse Text Normalization `_ + * - Text Processing + - Inverse Text Normalization for ASR - Thutmose Tagger + - `Inverse Text Normalization with Thutmose Tagger `_ * - Text Processing - Constructing Normalization Grammars with WFSTs - `WFST Tutorial `_ diff --git a/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py b/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py index 28caed27e533..33608b529c70 100644 --- a/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py +++ b/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py @@ -24,8 +24,7 @@ from collections import Counter from typing import Dict, Optional, TextIO, Tuple -from examples.nlp.text_normalization_as_tagging.dataset_preparation.utils import get_src_and_dst_for_alignment - +from nemo.collections.nlp.data.text_normalization_as_tagging.utils import get_src_and_dst_for_alignment from nemo.utils import logging parser = ArgumentParser(description="Produce data for the ThutmoseTaggerModel") diff --git a/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_for_alignment.py b/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_for_alignment.py index ca298f9ce5d9..9fe64c1105b8 100644 --- a/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_for_alignment.py +++ b/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_for_alignment.py @@ -47,7 +47,7 @@ from os.path import isdir, join from shutil import rmtree -from examples.nlp.text_normalization_as_tagging.dataset_preparation.utils import get_src_and_dst_for_alignment +from nemo.collections.nlp.data.text_normalization_as_tagging.utils import get_src_and_dst_for_alignment parser = ArgumentParser(description='Split corpus to subcorpora for giza alignment') parser.add_argument('--data_dir', type=str, required=True, help='Path to folder with data') diff --git a/examples/nlp/text_normalization_as_tagging/dataset_preparation/utils.py b/examples/nlp/text_normalization_as_tagging/dataset_preparation/utils.py deleted file mode 100644 index 906c1903024e..000000000000 --- a/examples/nlp/text_normalization_as_tagging/dataset_preparation/utils.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import re -from typing import Tuple - -from nemo.collections.nlp.data.text_normalization_as_tagging.utils import split_text_by_isalpha, spoken_preprocessing - -"""Utility functions for Thutmose Tagger data preparation.""" - - -def get_src_and_dst_for_alignment( - semiotic_class: str, written: str, spoken: str, lang: str -) -> Tuple[str, str, str, str]: - """Tokenize written and spoken span. - Args: - semiotic_class: str - lowercase semiotic class, ex. "cardinal" - written: str - written form, ex. "2015 году" - spoken: str - spoken form, ex. "две тысячи пятнадцатом году" - lang: str - language - Return: - src: str - written part, where digits and foreign letters are tokenized by characters, ex. "2 0 1 5" - dst: str - spoken part tokenized by space, ex. "две тысячи пятнадцатом" - same_begin: str - same_end: str - """ - written = written.casefold() - # ATTENTION!!! This is INPUT transformation! Need to do the same at inference time! - spoken = spoken_preprocessing(spoken) - - # remove same fragments at the beginning or at the end of spoken and written form - written_parts = written.split() - spoken_parts = spoken.split() - same_from_begin = 0 - same_from_end = 0 - for i in range(min(len(written_parts), len(spoken_parts))): - if written_parts[i] == spoken_parts[i]: - same_from_begin += 1 - else: - break - for i in range(min(len(written_parts), len(spoken_parts))): - if written_parts[-i - 1] == spoken_parts[-i - 1]: - same_from_end += 1 - else: - break - same_begin = written_parts[0:same_from_begin] - same_end = [] - if same_from_end == 0: - written = " ".join(written_parts[same_from_begin:]) - spoken = " ".join(spoken_parts[same_from_begin:]) - else: - written = " ".join(written_parts[same_from_begin:-same_from_end]) - spoken = " ".join(spoken_parts[same_from_begin:-same_from_end]) - same_end = written_parts[-same_from_end:] - - fragments = list(split_text_by_isalpha(written)) - written_tokens = [] - for frag in fragments: - if frag.isalpha(): - if semiotic_class == "plain" or semiotic_class == "letters" or semiotic_class == "electronic": - chars = list(frag.strip()) - chars[0] = "_" + chars[0] # prepend first symbol of a word with underscore - chars[-1] = chars[-1] + "_" # append underscore to the last symbol - written_tokens += chars - else: - written_tokens.append("_" + frag + "_") - else: - chars = list(frag.strip().replace(" ", "")) - if len(chars) > 0: - chars[0] = "_" + chars[0] # prepend first symbol of a non-alpha fragment with underscore - chars[-1] = chars[-1] + "_" # append underscore to the last symbol of a non-alpha fragment - written_tokens += chars - written_str = " ".join(written_tokens) - - # _н_ _._ _г_ _._ => _н._ _г._ - written_str = re.sub( - r"([abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя])_ _\._", r"\g<1>._", written_str - ) - # _тыс_ _. $ => _тыс._ _$ - written_str = re.sub( - r"([abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя])_ _\. 
([^_])]", r"\g<1>._ _\g<2>", written_str - ) - - if semiotic_class == "ordinal": - # _8 2 -_ _ом_ => _8 2-ом_ - written_str = re.sub( - r"([\d]) -_ _([abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя]+)_", - r"\g<1>-\g<2>_", - written_str, - ) - # _8 8_ _й_ _8 8й_ - written_str = re.sub( - r"([\d])_ _([abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя]+)_", r"\g<1>\g<2>_", written_str - ) - - if semiotic_class == "cardinal": - # _2 5 -_ _ти_ => _2 5-ти_ - written_str = re.sub(r"([\d]) -_ _(ти)_", r"\g<1>-\g<2>_", written_str) - written_str = re.sub(r"([\d]) -_ _(и)_", r"\g<1>-\g<2>_", written_str) - written_str = re.sub(r"([\d]) -_ _(мя)_", r"\g<1>-\g<2>_", written_str) - written_str = re.sub(r"([\d]) -_ _(ех)_", r"\g<1>-\g<2>_", written_str) - - # _i b m_ _'_ _s_ => _i b m's_ - if lang == "en": - written_str = re.sub(r"_ _'_ _s_", r"'s_", written_str) - - if semiotic_class == "date" and lang == "en": - # _1 9 8 0_ _s_ => _1 9 8 0s_ - written_str = re.sub(r"([\d])_ _s_", r"\g<1>s_", written_str) - # _1 9 5 0 '_ _s_ => _1 9 5 0's_ - written_str = re.sub(r"([\d]) '_ _s_", r"\g<1>'s_", written_str) - # _wednesday_ _2 6_ _th_ _september_ _2 0 1 2_ => _wednesday_ _2 6th_ _september_ _2 0 1 2_ - written_str = re.sub(r"([\d])_ _th_", r"\g<1>th_", written_str) - # _wednesday_ _may_ _2 1_ _st_ _, 2 0 1 4_ => _wednesday_ _may_ _2 1st_ _, 2 0 1 4_ - written_str = re.sub(r"([\d])_ _st_", r"\g<1>st_", written_str) - # _wednesday_ _2 3_ _rd_ _july_ _2 0 1 4_ => _wednesday_ _2 3rd_ _july_ _2 0 1 4_ - written_str = re.sub(r"([\d])_ _rd_", r"\g<1>rd_", written_str) - # _wednesday_ _2 2_ _nd_ _july_ _2 0 1 4_ => _wednesday_ _2 2nd_ _july_ _2 0 1 4_ - written_str = re.sub(r"([\d])_ _nd_", r"\g<1>nd_", written_str) - - written_str = re.sub(r"_mon_ _\. ", r"_mon._ ", written_str) - written_str = re.sub(r"_tue_ _\. ", r"_tue._ ", written_str) - written_str = re.sub(r"_wen_ _\. ", r"_wen._ ", written_str) - written_str = re.sub(r"_thu_ _\. ", r"_thu._ ", written_str) - written_str = re.sub(r"_fri_ _\. ", r"_fri._ ", written_str) - written_str = re.sub(r"_sat_ _\. ", r"_sat._ ", written_str) - written_str = re.sub(r"_sun_ _\. ", r"_sun._ ", written_str) - - written_str = re.sub(r"_jan_ _\. ", r"_jan._ ", written_str) - written_str = re.sub(r"_feb_ _\. ", r"_feb._ ", written_str) - written_str = re.sub(r"_mar_ _\. ", r"_mar._ ", written_str) - written_str = re.sub(r"_apr_ _\. ", r"_apr._ ", written_str) - written_str = re.sub(r"_may_ _\. ", r"_may._ ", written_str) - written_str = re.sub(r"_jun_ _\. ", r"_jun._ ", written_str) - written_str = re.sub(r"_jul_ _\. ", r"_jul._ ", written_str) - written_str = re.sub(r"_aug_ _\. ", r"_aug._ ", written_str) - written_str = re.sub(r"_sep_ _\. ", r"_sep._ ", written_str) - written_str = re.sub(r"_oct_ _\. ", r"_oct._ ", written_str) - written_str = re.sub(r"_nov_ _\. ", r"_nov._ ", written_str) - written_str = re.sub(r"_dec_ _\. ", r"_dec._ ", written_str) - - if semiotic_class == "date" and lang == "ru": - # _1 8 . 0 8 . 2 0 0 1_ => _1 8_ .08. _2 0 0 1_ - # _1 8 / 0 8 / 2 0 0 1_ => _1 8_ /08/ _2 0 0 1_ - # _1 8 - 0 8 - 2 0 0 1_ => _1 8_ -08- _2 0 0 1_ - written_str = re.sub(r"([\d]) \. ([01]) ([0123456789]) \. ([\d])", r"\g<1>_ .\g<2>\g<3>. _\g<4>", written_str) - written_str = re.sub(r"([\d]) / ([01]) ([0123456789]) / ([\d])", r"\g<1>_ /\g<2>\g<3>/ _\g<4>", written_str) - written_str = re.sub(r"([\d]) - ([01]) ([0123456789]) - ([\d])", r"\g<1>_ -\g<2>\g<3>- _\g<4>", written_str) - # _1 8 . 8 . 2 0 0 1_ => _1 8_ .8. 
_2 0 0 1_ - # _1 8 / 8 / 2 0 0 1_ => _1 8_ /8/ _2 0 0 1_ - # _1 8 - 8 - 2 0 0 1_ => _1 8_ -8- _2 0 0 1_ - written_str = re.sub(r"([\d]) \. ([123456789]) \. ([\d])", r"\g<1>_ .\g<2>. _\g<3>", written_str) - written_str = re.sub(r"([\d]) / ([123456789]) / ([\d])", r"\g<1>_ /\g<2>/ _\g<3>", written_str) - written_str = re.sub(r"([\d]) - ([123456789]) - ([\d])", r"\g<1>_ -\g<2>- _\g<3>", written_str) - - if semiotic_class == "money": - # if a span start with currency, move it to the end - # "_$ 2 5_" => "_2 5_ _$<<" #<< means "at post-processing move to the beginning of th semiotic span" - written_str = re.sub( - r"^(_[^0123456789abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя]) ([\d].*)$", - r"_\g<2> \g<1><<", - written_str, - ) - - # "_us_ _$ 7 0 0_" => "_us__$ 7 0 0_" - written_str = re.sub(r"^_us_ _\$ ([\d].*)$", r"_\g<1> _us__$<<", written_str) - - # "_2 5 $_" => "_2 5_ _$_" #insert space between last digit and dollar sign - written_str = re.sub( - r"([\d]) ([^0123456789abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя_]_)", - r"\g<1>_ _\g<2>", - written_str, - ) - - if semiotic_class == "time": - # "_pm_ _1 0_" => "_1 0_ _pm_<<" - written_str = re.sub(r"^(_[ap]m_) (_[\d].*)$", r"\g<2> \g<1><<", written_str) - - # "_8 : 0 0_ _a._ _m._ => _8:00_ _a._ _m._" - # "_1 2 : 0 0_ _a._ _m._ => _1 2:00_ _a._ _m._" - written_str = re.sub(r"(\d) [:.] 0 0_", r"\g<1>:00_", written_str) - - # "_2 : 4 2 : 4 4_" => "_2: 4 2: 4 4_" - written_str = re.sub(r"(\d) [:.] ", r"\g<1>: ", written_str) - - if semiotic_class == "measure": - # "_6 5 8_ _см_ _³ ._" => " _6 5 8_ _³> _см._" - # > means "at post-processing swap with the next token to the right" - written_str = re.sub( - r"(_[abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя.]+_) (_[³²]_?)", - r"\g<2>> \g<1>", - written_str, - ) - - return written_str, spoken, " ".join(same_begin), " ".join(same_end) diff --git a/examples/nlp/text_normalization_as_tagging/install_requirements.sh b/examples/nlp/text_normalization_as_tagging/install_requirements.sh new file mode 100644 index 000000000000..f54a6cb3f8fa --- /dev/null +++ b/examples/nlp/text_normalization_as_tagging/install_requirements.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +git clone https://github.com/moses-smt/giza-pp.git giza-pp +cd giza-pp +make +cd .. 
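The helper `get_src_and_dst_for_alignment` removed above now lives in `nemo/collections/nlp/data/text_normalization_as_tagging/utils.py` (see the hunk below), and both dataset-preparation scripts import it from there. A minimal sanity check of the new import path, reusing the example strings from the function's docstring (the semiotic class and `lang` are assumptions here, and the printed values in the comments are indicative rather than exact script output):

```python
from nemo.collections.nlp.data.text_normalization_as_tagging.utils import get_src_and_dst_for_alignment

# written/spoken pair taken from the docstring; semiotic_class="date" and lang="ru" are assumed
src, dst, same_begin, same_end = get_src_and_dst_for_alignment(
    semiotic_class="date", written="2015 году", spoken="две тысячи пятнадцатом году", lang="ru"
)
print(src)       # digits split into characters with underscore boundary marks, e.g. "_2 0 1 5_"
print(dst)       # "две тысячи пятнадцатом"
print(same_end)  # "году" -- the trailing word shared by both forms is stripped and reported separately
```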
diff --git a/nemo/collections/nlp/data/text_normalization_as_tagging/tagging.py b/nemo/collections/nlp/data/text_normalization_as_tagging/tagging.py index 6b17377b0225..6e27b561614e 100644 --- a/nemo/collections/nlp/data/text_normalization_as_tagging/tagging.py +++ b/nemo/collections/nlp/data/text_normalization_as_tagging/tagging.py @@ -207,9 +207,10 @@ def realize_output(self, tags: List[Tag], semiotic_labels: List[str]) -> Tuple[s output_tokens.append(frag.replace(" ", "").replace("_", "")) else: output_tokens.append(frag.strip().replace("_", "")) - + output_str = " ".join(output_tokens) + output_str = re.sub(r" +", " ", output_str) return ( - " ".join(output_tokens), + output_str, " ".join(self.source_tokens), " ".join(out_tags_without_swap), output_tags_with_swap_str, diff --git a/nemo/collections/nlp/data/text_normalization_as_tagging/utils.py b/nemo/collections/nlp/data/text_normalization_as_tagging/utils.py index 562f4703464c..253f7a41c703 100644 --- a/nemo/collections/nlp/data/text_normalization_as_tagging/utils.py +++ b/nemo/collections/nlp/data/text_normalization_as_tagging/utils.py @@ -15,7 +15,7 @@ import re from itertools import groupby -from typing import Dict, List +from typing import Dict, List, Tuple """Utility functions for Thutmose Tagger.""" @@ -118,3 +118,190 @@ def spoken_preprocessing(spoken: str) -> str: spoken = re.sub(r" фунтом стерлингов", r" фунтом-стерлингов", spoken) return spoken + + +## This function is used only in data preparation (examples/nlp/normalisation_as_tagging/dataset_preparation) +def get_src_and_dst_for_alignment( + semiotic_class: str, written: str, spoken: str, lang: str +) -> Tuple[str, str, str, str]: + """Tokenize written and spoken span. + Args: + semiotic_class: str - lowercase semiotic class, ex. "cardinal" + written: str - written form, ex. "2015 году" + spoken: str - spoken form, ex. "две тысячи пятнадцатом году" + lang: str - language + Return: + src: str - written part, where digits and foreign letters are tokenized by characters, ex. "2 0 1 5" + dst: str - spoken part tokenized by space, ex. "две тысячи пятнадцатом" + same_begin: str + same_end: str + """ + written = written.casefold() + # ATTENTION!!! This is INPUT transformation! Need to do the same at inference time! 
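+    # (i.e. whatever preprocessing is applied to the spoken text here must also be applied to the spoken input at inference time)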
+ spoken = spoken_preprocessing(spoken) + + # remove same fragments at the beginning or at the end of spoken and written form + written_parts = written.split() + spoken_parts = spoken.split() + same_from_begin = 0 + same_from_end = 0 + for i in range(min(len(written_parts), len(spoken_parts))): + if written_parts[i] == spoken_parts[i]: + same_from_begin += 1 + else: + break + for i in range(min(len(written_parts), len(spoken_parts))): + if written_parts[-i - 1] == spoken_parts[-i - 1]: + same_from_end += 1 + else: + break + same_begin = written_parts[0:same_from_begin] + same_end = [] + if same_from_end == 0: + written = " ".join(written_parts[same_from_begin:]) + spoken = " ".join(spoken_parts[same_from_begin:]) + else: + written = " ".join(written_parts[same_from_begin:-same_from_end]) + spoken = " ".join(spoken_parts[same_from_begin:-same_from_end]) + same_end = written_parts[-same_from_end:] + + fragments = list(split_text_by_isalpha(written)) + written_tokens = [] + for frag in fragments: + if frag.isalpha(): + if semiotic_class == "plain" or semiotic_class == "letters" or semiotic_class == "electronic": + chars = list(frag.strip()) + chars[0] = "_" + chars[0] # prepend first symbol of a word with underscore + chars[-1] = chars[-1] + "_" # append underscore to the last symbol + written_tokens += chars + else: + written_tokens.append("_" + frag + "_") + else: + chars = list(frag.strip().replace(" ", "")) + if len(chars) > 0: + chars[0] = "_" + chars[0] # prepend first symbol of a non-alpha fragment with underscore + chars[-1] = chars[-1] + "_" # append underscore to the last symbol of a non-alpha fragment + written_tokens += chars + written_str = " ".join(written_tokens) + + # _н_ _._ _г_ _._ => _н._ _г._ + written_str = re.sub( + r"([abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя])_ _\._", r"\g<1>._", written_str + ) + # _тыс_ _. $ => _тыс._ _$ + written_str = re.sub( + r"([abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя])_ _\. 
([^_])]", r"\g<1>._ _\g<2>", written_str + ) + + if semiotic_class == "ordinal": + # _8 2 -_ _ом_ => _8 2-ом_ + written_str = re.sub( + r"([\d]) -_ _([abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя]+)_", + r"\g<1>-\g<2>_", + written_str, + ) + # _8 8_ _й_ _8 8й_ + written_str = re.sub( + r"([\d])_ _([abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя]+)_", r"\g<1>\g<2>_", written_str + ) + + if semiotic_class == "cardinal": + # _2 5 -_ _ти_ => _2 5-ти_ + written_str = re.sub(r"([\d]) -_ _(ти)_", r"\g<1>-\g<2>_", written_str) + written_str = re.sub(r"([\d]) -_ _(и)_", r"\g<1>-\g<2>_", written_str) + written_str = re.sub(r"([\d]) -_ _(мя)_", r"\g<1>-\g<2>_", written_str) + written_str = re.sub(r"([\d]) -_ _(ех)_", r"\g<1>-\g<2>_", written_str) + + # _i b m_ _'_ _s_ => _i b m's_ + if lang == "en": + written_str = re.sub(r"_ _'_ _s_", r"'s_", written_str) + + if semiotic_class == "date" and lang == "en": + # _1 9 8 0_ _s_ => _1 9 8 0s_ + written_str = re.sub(r"([\d])_ _s_", r"\g<1>s_", written_str) + # _1 9 5 0 '_ _s_ => _1 9 5 0's_ + written_str = re.sub(r"([\d]) '_ _s_", r"\g<1>'s_", written_str) + # _wednesday_ _2 6_ _th_ _september_ _2 0 1 2_ => _wednesday_ _2 6th_ _september_ _2 0 1 2_ + written_str = re.sub(r"([\d])_ _th_", r"\g<1>th_", written_str) + # _wednesday_ _may_ _2 1_ _st_ _, 2 0 1 4_ => _wednesday_ _may_ _2 1st_ _, 2 0 1 4_ + written_str = re.sub(r"([\d])_ _st_", r"\g<1>st_", written_str) + # _wednesday_ _2 3_ _rd_ _july_ _2 0 1 4_ => _wednesday_ _2 3rd_ _july_ _2 0 1 4_ + written_str = re.sub(r"([\d])_ _rd_", r"\g<1>rd_", written_str) + # _wednesday_ _2 2_ _nd_ _july_ _2 0 1 4_ => _wednesday_ _2 2nd_ _july_ _2 0 1 4_ + written_str = re.sub(r"([\d])_ _nd_", r"\g<1>nd_", written_str) + + written_str = re.sub(r"_mon_ _\. ", r"_mon._ ", written_str) + written_str = re.sub(r"_tue_ _\. ", r"_tue._ ", written_str) + written_str = re.sub(r"_wen_ _\. ", r"_wen._ ", written_str) + written_str = re.sub(r"_thu_ _\. ", r"_thu._ ", written_str) + written_str = re.sub(r"_fri_ _\. ", r"_fri._ ", written_str) + written_str = re.sub(r"_sat_ _\. ", r"_sat._ ", written_str) + written_str = re.sub(r"_sun_ _\. ", r"_sun._ ", written_str) + + written_str = re.sub(r"_jan_ _\. ", r"_jan._ ", written_str) + written_str = re.sub(r"_feb_ _\. ", r"_feb._ ", written_str) + written_str = re.sub(r"_mar_ _\. ", r"_mar._ ", written_str) + written_str = re.sub(r"_apr_ _\. ", r"_apr._ ", written_str) + written_str = re.sub(r"_may_ _\. ", r"_may._ ", written_str) + written_str = re.sub(r"_jun_ _\. ", r"_jun._ ", written_str) + written_str = re.sub(r"_jul_ _\. ", r"_jul._ ", written_str) + written_str = re.sub(r"_aug_ _\. ", r"_aug._ ", written_str) + written_str = re.sub(r"_sep_ _\. ", r"_sep._ ", written_str) + written_str = re.sub(r"_oct_ _\. ", r"_oct._ ", written_str) + written_str = re.sub(r"_nov_ _\. ", r"_nov._ ", written_str) + written_str = re.sub(r"_dec_ _\. ", r"_dec._ ", written_str) + + if semiotic_class == "date" and lang == "ru": + # _1 8 . 0 8 . 2 0 0 1_ => _1 8_ .08. _2 0 0 1_ + # _1 8 / 0 8 / 2 0 0 1_ => _1 8_ /08/ _2 0 0 1_ + # _1 8 - 0 8 - 2 0 0 1_ => _1 8_ -08- _2 0 0 1_ + written_str = re.sub(r"([\d]) \. ([01]) ([0123456789]) \. ([\d])", r"\g<1>_ .\g<2>\g<3>. _\g<4>", written_str) + written_str = re.sub(r"([\d]) / ([01]) ([0123456789]) / ([\d])", r"\g<1>_ /\g<2>\g<3>/ _\g<4>", written_str) + written_str = re.sub(r"([\d]) - ([01]) ([0123456789]) - ([\d])", r"\g<1>_ -\g<2>\g<3>- _\g<4>", written_str) + # _1 8 . 8 . 2 0 0 1_ => _1 8_ .8. 
_2 0 0 1_ + # _1 8 / 8 / 2 0 0 1_ => _1 8_ /8/ _2 0 0 1_ + # _1 8 - 8 - 2 0 0 1_ => _1 8_ -8- _2 0 0 1_ + written_str = re.sub(r"([\d]) \. ([123456789]) \. ([\d])", r"\g<1>_ .\g<2>. _\g<3>", written_str) + written_str = re.sub(r"([\d]) / ([123456789]) / ([\d])", r"\g<1>_ /\g<2>/ _\g<3>", written_str) + written_str = re.sub(r"([\d]) - ([123456789]) - ([\d])", r"\g<1>_ -\g<2>- _\g<3>", written_str) + + if semiotic_class == "money": + # if a span start with currency, move it to the end + # "_$ 2 5_" => "_2 5_ _$<<" #<< means "at post-processing move to the beginning of th semiotic span" + written_str = re.sub( + r"^(_[^0123456789abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя]) ([\d].*)$", + r"_\g<2> \g<1><<", + written_str, + ) + + # "_us_ _$ 7 0 0_" => "_us__$ 7 0 0_" + written_str = re.sub(r"^_us_ _\$ ([\d].*)$", r"_\g<1> _us__$<<", written_str) + + # "_2 5 $_" => "_2 5_ _$_" #insert space between last digit and dollar sign + written_str = re.sub( + r"([\d]) ([^0123456789abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя_]_)", + r"\g<1>_ _\g<2>", + written_str, + ) + + if semiotic_class == "time": + # "_pm_ _1 0_" => "_1 0_ _pm_<<" + written_str = re.sub(r"^(_[ap]m_) (_[\d].*)$", r"\g<2> \g<1><<", written_str) + + # "_8 : 0 0_ _a._ _m._ => _8:00_ _a._ _m._" + # "_1 2 : 0 0_ _a._ _m._ => _1 2:00_ _a._ _m._" + written_str = re.sub(r"(\d) [:.] 0 0_", r"\g<1>:00_", written_str) + + # "_2 : 4 2 : 4 4_" => "_2: 4 2: 4 4_" + written_str = re.sub(r"(\d) [:.] ", r"\g<1>: ", written_str) + + if semiotic_class == "measure": + # "_6 5 8_ _см_ _³ ._" => " _6 5 8_ _³> _см._" + # > means "at post-processing swap with the next token to the right" + written_str = re.sub( + r"(_[abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя.]+_) (_[³²]_?)", + r"\g<2>> \g<1>", + written_str, + ) + + return written_str, spoken, " ".join(same_begin), " ".join(same_end) diff --git a/tutorials/text_processing/ITN_with_Thutmose_Tagger.ipynb b/tutorials/text_processing/ITN_with_Thutmose_Tagger.ipynb new file mode 100644 index 000000000000..50bf88a1e25c --- /dev/null +++ b/tutorials/text_processing/ITN_with_Thutmose_Tagger.ipynb @@ -0,0 +1,1047 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VFOY_ljrReXk" + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies.\n", + "\"\"\"\n", + "\n", + "import os\n", + "\n", + "# install NeMo\n", + "BRANCH = 'r1.9.0'\n", + "\n", + "GITHUB_ACCOUNT = 'NVIDIA' # change this if using a fork\n", + "\n", + "# either provide a path to local NeMo repository with NeMo already installed or git clone\n", + "\n", + "# option #1: local path to NeMo repo with NeMo already installed\n", + "NEMO_DIR_PATH = \"NeMo\"\n", + "\n", + "# option #2: download NeMo repo\n", + "if 'google.colab' in str(get_ipython()) or not os.path.exists(NEMO_DIR_PATH):\n", + " ! git clone -b $BRANCH https://github.com/{GITHUB_ACCOUNT}/NeMo\n", + " % cd NeMo\n", + " ! 
python -m pip install git+https://github.com/{GITHUB_ACCOUNT}/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", + " % cd .." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "V8RfIztARxKH" + }, + "outputs": [], + "source": [ + "# If you're not using Colab, you might need to upgrade jupyter notebook to avoid the following error:\n", + "# 'ImportError: IProgress not found. Please update jupyter and ipywidgets.'\n", + "\n", + "! pip install ipywidgets\n", + "! jupyter nbextension enable --py widgetsnbextension\n", + "\n", + "# Please restart the kernel after running this cell" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oaTOPJHhTteF" + }, + "source": [ + "# Task Description\n", + "**Inverse text normalization (ITN)** is an important post-processing step within an automatic speech recognition (ASR) system. \n", + "ITN transforms spoken-domain text into its written form:\n", + "\n", + "> **Input:** \"on may third we paid one hundred and twenty three dollars\"\n", + "\n", + "> **Output:** \"on may 3 we paid \\$123\".\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sxo-kHP7frEX" + }, + "source": [ + "# Thutmose Tagger approach\n", + "We aim to do the following:\n", + "1. Align ITN expressions from the [Google Text normalization dataset](https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish) on a granular level using [GIZA++](https://github.com/moses-smt/giza-pp), to get a monotonic one-to-one correspondence between each *spoken word* and corresponding *fragments* in written form. \n", + "2. Get a restricted vocabulary of target fragments (tags) that can cover most spoken-written pair conversions.\n", + "3. Build training dataset, where the input is the sentence in spoken form and the output is tags for all input words. \n", + "4. Train a token classifier neural model (see Figure below). \n", + "5. Apply a simple postprocessing procedure upon the tag sequence to get the final output" + ] + }, + { + "cell_type": "markdown", + "source": [ + "![Thutmose Tagger Architecture](images/thutmose_tagger_architecture.png)" + ], + "metadata": { + "id": "RG403l1gKyRy" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aMPeNtAracI9" + }, + "source": [ + "# Dataset\n", + "\n", + "The full English part of [Google Text normalization dataset](https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish) consists of 1.1 billion words. For this tutorial we use only small subset of it.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EqnuAgNNcVY-" + }, + "outputs": [], + "source": [ + "! wget \"https://multilangaudiosamples.s3.us-east-2.amazonaws.com/en_data_small.zip\" \".\"\n", + "! unzip en_data_small" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CkJ3LLaxRNFh" + }, + "outputs": [], + "source": [ + "## actually we do not need separate dev and test data in this tutorial, so just copy it \n", + "!cp -r en_data_small/test en_data_small/dev" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HLoXgnpMVACe" + }, + "source": [ + "\n", + "The dataset contains unnormalized (i.e. written form) and normalized (i.e. spoken form) sentence pairs that are aligned *on a phrase-level*. 
The normalized text is synthetic - obtained with the [Kestrel TTS text normalization system](https://www.cambridge.org/core/journals/natural-language-engineering/article/abs/kestrel-tts-text-normalization-system/F0C18A3F596B75D83B75C479E23795DA), so it’s not considered 100% correct.\n", + "\n", + "```\n", + "PLAIN Retrieved \n", + "DATE 18 April 2013 the eighteenth of april twenty thirteen\n", + "PUNCT . sil\n", + " \n", + "PLAIN Neuhorst \n", + "PUNCT ( sil\n", + "PLAIN Canada \n", + "DATE 2006 two thousand six\n", + "PLAIN Census \n", + "PLAIN population \n", + "CARDINAL 126 one hundred twenty six\n", + "PUNCT ) sil\n", + "PLAIN is \n", + "PLAIN a \n", + "PLAIN small \n", + "PLAIN hamlet \n", + "PLAIN in \n", + "PLAIN Saskatchewan \n", + "PUNCT , sil\n", + "PLAIN Canada \n", + "PLAIN about \n", + "CARDINAL 30 thirty\n", + "PLAIN minutes \n", + "PLAIN north \n", + "PLAIN of \n", + "PLAIN Saskatoon \n", + "PUNCT . sil\n", + " \n", + "```\n", + "\n", + "The following classes appear in the dataset:\n", + "* ADDRESS\n", + "* CARDINAL\n", + "* DATE\n", + "* DECIMAL\n", + "* DIGIT\n", + "* ELECTRONIC\n", + "* FRACTION\n", + "* LETTERS\n", + "* MEASURE\n", + "* MONEY\n", + "* ORDINAL\n", + "* PLAIN\n", + "* PUNCT\n", + "* TELEPHONE\n", + "* TIME\n", + "* VERBATIM\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# 1. Align ITN expressions on a granular-level" + ], + "metadata": { + "id": "rewQY1pbPeq8" + } + }, + { + "cell_type": "markdown", + "source": [ + "Let's download and compile GIZA++ as we will need it soon" + ], + "metadata": { + "id": "5cLXx7qdPpUK" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LNLjIDUJdY5f" + }, + "outputs": [], + "source": [ + "! git clone https://github.com/moses-smt/giza-pp.git giza-pp\n", + "%cd giza-pp\n", + "! ls\n", + "! make\n", + "%cd .." + ] + }, + { + "cell_type": "markdown", + "source": [ + "Do some imports" + ], + "metadata": { + "id": "2AfIeiu_P0Ik" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mz8_O4kfS0fH" + }, + "outputs": [], + "source": [ + "from nemo.collections import nlp as nemo_nlp\n", + "from nemo.utils.exp_manager import exp_manager\n", + "import nemo\n", + "\n", + "import wget \n", + "import torch\n", + "import pytorch_lightning as pl\n", + "from omegaconf import OmegaConf\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uYW_qsDgkhCw" + }, + "source": [ + "First we need to prepare the input data for the aligner (GIZA++).\n", + "\n", + "We regard the corpus of ITN phrase-pairs as a parallel corpus. Parallel means that each pair has an equivalent meaning, though they may consist of any number of tokens and the task of an aligner is to find which source tokens corresponds to which target tokens.\n", + "The spoken phrase is tokenized by word boundary, while the written phrase is tokenized as follows: \n", + "1. All alphabetic sequences are separate tokens\n", + "2. In numeric sequences each character is a separate token.\n", + "3. All non-alphanumeric characters are separate tokens.\n", + "4. We add an underscore symbol to mark the beginning and end of a\n", + "sequence for future detokenization. 
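A minimal Python sketch of this written-side tokenization, covering only the default case (in the real preparation scripts, `plain`/`letters`/`electronic` spans are additionally split character-by-character, and a number of class-specific fix-ups are applied afterwards):

```python
from itertools import groupby

def tokenize_written(written: str) -> str:
    """Sketch: alphabetic chunks stay whole, everything else is split per character,
    and underscores mark the beginning and end of each fragment for later detokenization."""
    tokens = []
    for is_alpha, group in groupby(written.casefold(), key=str.isalpha):
        frag = "".join(group)
        if is_alpha:
            tokens.append("_" + frag + "_")
        else:
            chars = list(frag.strip().replace(" ", ""))
            if chars:
                chars[0] = "_" + chars[0]
                chars[-1] = chars[-1] + "_"
                tokens.extend(chars)
    return " ".join(tokens)

print(tokenize_written("jan 30, 2005"))  # _jan_ _3 0 , 2 0 0 5_
```

The worked example below shows the same convention.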
\n", + "\n", + "Example\n", + "> **Spoken:** `january thirtieth two thousand five`\n", + "\n", + "> **Written initial:** `jan 30, 2005`\n", + "\n", + "> **Written tokenized**: `_jan_ _3 0 , 2 0 0 5_`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "miXYxOv_mNVo" + }, + "source": [ + "The script [prepare_corpora_for_alignment.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_for_alignment.py) prepares the described parallel corpora. It extracts all unique ITN phrase-pairs from the Google TN dataset, tokenizes them as described above and stores in separate folders for each semiotic class. It also generates a bash script for running the alignment. At the end it prints how many examples it has found:\n", + "```\n", + "content/alignment/punct has 920953 instances\n", + "content/alignment/date has 150499 instances\n", + "content/alignment/letters has 68340 instances\n", + "content/alignment/cardinal has 61029 instances\n", + "...\n", + "``` " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_A12y5zNn4O0" + }, + "outputs": [], + "source": [ + "WORK_DIR=!pwd # returns array containing a single path, \n", + "WORK_DIR=WORK_DIR[0]\n", + "\n", + "CORPUS_LANG=\"en\"\n", + "if 'google.colab' in str(get_ipython()) or not os.path.exists(NEMO_DIR_PATH):\n", + " NEMO_PATH=WORK_DIR + \"/NeMo\"\n", + "else:\n", + " NEMO_PATH=NEMO_DIR_PATH\n", + "GIZA_BIN_DIR=WORK_DIR + \"/giza-pp/GIZA++-v2\"\n", + "MCKLS_BINARY=WORK_DIR + \"/giza-pp/mkcls-v2/mkcls\"\n", + "CORPUS_DIR=WORK_DIR + \"/en_data_small\"\n", + "ALIGNMENT_DIR=WORK_DIR + \"/alignment\"\n", + "\n", + "!mkdir {ALIGNMENT_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BguRSoIXesFx" + }, + "outputs": [], + "source": [ + "!python {NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_for_alignment.py \\\n", + " --data_dir={CORPUS_DIR} \\\n", + " --out_dir={ALIGNMENT_DIR} \\\n", + " --giza_dir={GIZA_BIN_DIR} \\\n", + " --mckls_binary={MCKLS_BINARY} \\\n", + " --lang={CORPUS_LANG}" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Let's exclude punct class, as our itn task doesn't require to restore punctuation marks" + ], + "metadata": { + "id": "v8LscfJrLUeg" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VNOQ4nW2yF6I" + }, + "outputs": [], + "source": [ + "!rm -r {ALIGNMENT_DIR}/punct\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Let's run GIZA++ alignment. \n", + "In this tutorial we only work with three semiotic classes: date, letters and cardinal (in real setting all classes are used, excluding punct).\n", + "\n", + "**Attention**: the environment variable USER should be defined with any value, otherwise GIZA++ ends with segmenation fault. " + ], + "metadata": { + "id": "uUQMhEKGT7gv" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UPxcPu0_Xh2Y" + }, + "outputs": [], + "source": [ + "!chmod +x {ALIGNMENT_DIR}/date/run.sh\n", + "!chmod +x {ALIGNMENT_DIR}/letters/run.sh\n", + "!chmod +x {ALIGNMENT_DIR}/cardinal/run.sh" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gOCv-ctbU3Rv" + }, + "outputs": [], + "source": [ + "## It is necessary to specify environment variable USER=, otherwise GIZA++ terminates with a segfault \n", + "\n", + "%cd {ALIGNMENT_DIR}/date\n", + "! 
export USER=\"user\"; ./run.sh\n", + "%cd ../..\n", + "\n", + "%cd {ALIGNMENT_DIR}/letters\n", + "! export USER=\"user\"; ./run.sh\n", + "%cd ../..\n", + "\n", + "%cd {ALIGNMENT_DIR}/cardinal\n", + "! export USER=\"user\"; ./run.sh\n", + "%cd ../.." + ] + }, + { + "cell_type": "markdown", + "source": [ + "GIZA++ will generate many files in our class folders, but we need only two files with final alignments, those with suffixes `A3.final`. The two files correspond to the alignments produced by two GIZA++ runs - direct and reverse (switching source and target corpus). This is a common practice, it allows us to find safer alignment points - tokens that were aligned to one another in both runs. The script [extract_giza_alignments.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py) heuristically combines these two GIZA++ alignments. It also applies a bunch of regular expressions to correct some alignment mistakes." + ], + "metadata": { + "id": "ueJYVF0cU3ic" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "j5WpPkzHNICP" + }, + "outputs": [], + "source": [ + "! python {NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py \\\n", + " --mode=itn \\\n", + " --giza_dir={ALIGNMENT_DIR}/date \\\n", + " --giza_suffix=\"A3.final\" \\\n", + " --out_filename=itn.out \\\n", + " --lang={CORPUS_LANG}\n", + "! python {NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py \\\n", + " --mode=itn \\\n", + " --giza_dir={ALIGNMENT_DIR}/letters \\\n", + " --giza_suffix=\"A3.final\" \\\n", + " --out_filename=itn.out \\\n", + " --lang={CORPUS_LANG}\n", + "! python {NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py \\\n", + " --mode=itn \\\n", + " --giza_dir={ALIGNMENT_DIR}/cardinal \\\n", + " --giza_suffix=\"A3.final\" \\\n", + " --out_filename=itn.out \\\n", + " --lang={CORPUS_LANG}" + ] + }, + { + "cell_type": "markdown", + "source": [ + "When we prepared the input corpus of ITN pairs for GIZA++, we uniqualized them and stored the frequencies in a separate file `freq`. Now let's append the frequencies to the resulting alignments." + ], + "metadata": { + "id": "vpqiKrS6XBlP" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fxZ5jjUPlOFv" + }, + "outputs": [], + "source": [ + "!paste -d\"\\t\" {ALIGNMENT_DIR}/date/freq {ALIGNMENT_DIR}/date/itn.out > {ALIGNMENT_DIR}/date/itn.out2\n", + "!paste -d\"\\t\" {ALIGNMENT_DIR}/letters/freq {ALIGNMENT_DIR}/letters/itn.out > {ALIGNMENT_DIR}/letters/itn.out2\n", + "!paste -d\"\\t\" {ALIGNMENT_DIR}/cardinal/freq {ALIGNMENT_DIR}/cardinal/itn.out > {ALIGNMENT_DIR}/cardinal/itn.out2" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Let's look at what we get. 
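(As a reminder, the two alignment columns in these files come from the direct and reverse GIZA++ runs. The toy snippet below, with made-up link sets, only illustrates the "safe alignment points" idea described above; the real `extract_giza_alignments.py` adds heuristics and regex-based corrections on top of it.)

```python
# hypothetical word-alignment links as (source_index, target_index) pairs
direct_links = {(0, 0), (1, 1), (1, 2), (2, 3)}   # direct run: spoken -> written
reverse_links = {(0, 0), (1, 1), (2, 3), (2, 4)}  # reverse run, with source and target swapped back

# "safe" alignment points: links that both runs agree on
print(sorted(direct_links & reverse_links))  # [(0, 0), (1, 1), (2, 3)]
```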
The output should look like\n", + "![Top of file with aligned expressions](images/thutmose_tagger_alignment_top.png)\n", + "...\n", + "![Bottom of file with aligned expressions](images/thutmose_tagger_alignment_bottom.png)\n" + ], + "metadata": { + "id": "yzt87qeEX5o0" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tJNFvVhG4SMo" + }, + "outputs": [], + "source": [ + "df = pd.read_csv(ALIGNMENT_DIR + \"/cardinal/itn.out2\", sep=\"\\t\", header=None)\n", + "df.columns = [\"freq\", \"verdict\", \"spoken\", \"written initial tokens\", \"left-side alignment\", \"right-side alignment\"]\n", + "is_spoken_multiword = df[\"spoken\"].apply(lambda x: \" \" in x)\n", + "df2 = df[is_spoken_multiword].sort_values(\"freq\", ascending=False).reset_index(drop=True)\n", + "df2.head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cEcXh1MzDWFy" + }, + "outputs": [], + "source": [ + "df2.tail(10)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# 2. Get a restricted vocabulary of target fragments (tags)\n", + "\n", + "There can be some inconsistencies in the automatic alignments, but nevertheless we now have **one-to-one correspondence** between input words and output fragments. Let's collect all fragments in a vocabulary! The output should look like this\n", + "![Tag vocabulary](images/thutmose_tagger_tag_vocabulary.png)\n", + "\n" + ], + "metadata": { + "id": "OdEuRQKXYG3D" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "89zwtEQmQJZ1" + }, + "outputs": [], + "source": [ + "! python {NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py \\\n", + " --mode=get_replacement_vocab \\\n", + " --giza_dir={ALIGNMENT_DIR} \\\n", + " --alignment_filename=itn.out2 \\\n", + " --data_dir=\"\" \\\n", + " --vocab_filename={WORK_DIR}/replacement_vocab_full.txt \\\n", + " --out_filename=\"\" \\\n", + " --lang={CORPUS_LANG}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tx4gXO7CGzyQ" + }, + "outputs": [], + "source": [ + "df = pd.read_csv(\"replacement_vocab_full.txt.cardinal\", sep=\"\\t\", header=None)\n", + "df.columns = [\"replacement tag\", \"freq\"]\n", + "df" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Tags with low frequencies are likely to be derived from sporadic alignment mistakes, so let's truncate them, and put together the tags from all our semiotic classes." + ], + "metadata": { + "id": "Ts_G3TnLEQn4" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2TKbJELTFFXG" + }, + "outputs": [], + "source": [ + "! head -n 150 replacement_vocab_full.txt.cardinal > replacement_vocab_cardinal.txt\n", + "! head -n 150 replacement_vocab_full.txt.date > replacement_vocab_date.txt\n", + "! head -n 150 replacement_vocab_full.txt.letters > replacement_vocab_letters.txt\n", + "! cat replacement_vocab_cardinal.txt \\\n", + " replacement_vocab_date.txt \\\n", + " replacement_vocab_letters.txt > replacement_vocab.select.txt\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "x6eEYkVlJDK-" + }, + "source": [ + "After concatenation the vocabulary file can contain duplicates of the same tags coming from different semiotic classes, but this is not important at this moment. The final vocabulary will be created later." 
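For reference, a pure-Python sketch equivalent to the `head`/`cat` selection above; the per-class file names and the cutoff of 150 follow the cells above, and nothing is assumed about the format beyond the two tab-separated columns shown earlier.

```python
import pandas as pd

selected = []
for cls in ["cardinal", "date", "letters"]:
    # each per-class vocabulary file is "<replacement tag>\t<frequency>", sorted by frequency
    df = pd.read_csv(f"replacement_vocab_full.txt.{cls}", sep="\t", header=None, names=["tag", "freq"])
    selected.append(df.head(150))  # drop low-frequency tags, which are likely alignment noise

vocab = pd.concat(selected)
vocab.to_csv("replacement_vocab.select.txt", sep="\t", header=False, index=False)
print(len(vocab), "tags selected; duplicates across classes are resolved when the final label map is built")
```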
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DoDHW-p1FUso" + }, + "outputs": [], + "source": [ + "! wc -l replacement_vocab.select.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xcQK5cHQH_NH" + }, + "outputs": [], + "source": [ + "! python {NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py \\\n", + " --mode=filter_by_vocab \\\n", + " --giza_dir={ALIGNMENT_DIR} \\\n", + " --alignment_filename=itn.out2 \\\n", + " --data_dir=\"\" \\\n", + " --vocab_filename={WORK_DIR}/replacement_vocab.select.txt \\\n", + " --out_filename=itn.select.out \\\n", + " --lang={CORPUS_LANG}\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yWhCQJ5DLgoZ" + }, + "source": [ + "The script `prepare_corpora_after_alignment.py --mode=filter_by_vocab` discards examples that are not fully covered with our selected replacement vocabulary. We can see that number of lines slightly decreases.\n", + "```\n", + "4997 content/alignment/cardinal/itn.out2\n", + "4681 content/alignment/cardinal/itn.select.out\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_nqeRRjmKoWg" + }, + "outputs": [], + "source": [ + "! wc -l {ALIGNMENT_DIR}/cardinal/itn.out2\n", + "! wc -l {ALIGNMENT_DIR}/cardinal/itn.select.out\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kUQwwCkLMKEX" + }, + "source": [ + "The format of lines also slightly changes: we add the name of semiotic class, choose only one alignment(left-side or right-side) based on class, and remove unnecessary columns.\n", + "\n", + "![Final alignment](images/thutmose_tagger_final_alignment.png)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "83Voerh_K8gR" + }, + "outputs": [], + "source": [ + "df = pd.read_csv(ALIGNMENT_DIR + \"/cardinal/itn.select.out\", sep=\"\\t\", header=None)\n", + "df.columns = [\"semiotic class\", \"spoken\", \"written initial fragments\", \"alignment\"]\n", + "df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "C6cAEYNHOKex" + }, + "source": [ + "# 3. Build training dataset \n", + "\n", + "Now it's time to create a tag-labeled dataset, containing _full sentences_. After previous step we got a large dictionary of ITN phrase conversions *that we know how to tag*. Once again we loop through the Google TN dataset and process each sentence in the following way:\n", + "\n", + "* If a sentence contains at least one ITN conversion, that is missing from our dictionary, this sentence is discarded.\n", + "* Otherwise we assign tags to the input words\n", + " 1. All words outside ITN conversion spans are tagged as ``.\n", + " 2. 
Tags for words inside ITN spans are taken from the dictionary\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DQVCzljrMyHu" + }, + "outputs": [], + "source": [ + "!python {NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py \\\n", + " --mode=get_labeled_corpus \\\n", + " --giza_dir={ALIGNMENT_DIR} \\\n", + " --alignment_filename=itn.select.out \\\n", + " --data_dir={CORPUS_DIR}/dev \\\n", + " --vocab_filename=\"\" \\\n", + " --out_filename={CORPUS_DIR}/dev.labeled \\\n", + " --lang={CORPUS_LANG}\n", + "\n", + "!python {NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py \\\n", + " --mode=get_labeled_corpus \\\n", + " --giza_dir={ALIGNMENT_DIR} \\\n", + " --alignment_filename=itn.select.out \\\n", + " --data_dir={CORPUS_DIR}/train \\\n", + " --vocab_filename=\"\" \\\n", + " --out_filename={CORPUS_DIR}/train.labeled \\\n", + " --lang=${CORPUS_LANG}\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bBfuML8TQrwz" + }, + "source": [ + "The resulting file consists of three columns:\n", + "* input words\n", + "* target tags\n", + "* semiotic spans (if any)\n", + "\n", + "The semiotic spans are separated by semicolon, each span consists of class, begin and end in terms of input word positions e.g. \"DATE 6 9\".\n", + "\n", + "```\n", + "it can be summarized as an error driven transformation based tagger\t \t\n", + "this plan was first enacted in nineteen eighty four and continued to be followed for nineteen years\t _19 8 4_ _19_ \tDATE 6 9;CARDINAL 15 16\n", + "```\n", + "The semiotic spans are used for two purposes:\n", + " \n", + "1. During validation step we calculate accuracy w.r.t. semiotic spans. For example, a DATE span is correct if **all** tag predictions inside this span match the ground truth labels.\n", + "2. The model has additional classiffication head that predicts a semiotic class label for each of the input words. These predictions are used in the post-processing step for better handling of swaps.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-TjToCTDN8t8" + }, + "outputs": [], + "source": [ + "! head {CORPUS_DIR}/dev.labeled" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Bf9Evpn8RWp4" + }, + "source": [ + "Get the final label vocabulary, based on our labeled corpora. The output file should look like this\n", + "```\n", + "KEEP\n", + "DELETE\n", + "DELETE|_20\n", + "DELETE|_19\n", + "DELETE|_2\n", + "DELETE|_200\n", + "DELETE|,20\n", + "DELETE|9_\n", + "DELETE|9\n", + "DELETE|8_\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2RY2pZwEPdlZ" + }, + "outputs": [], + "source": [ + "! python {NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/get_label_vocab.py \\\n", + " --train_filename={CORPUS_DIR}/train.labeled \\\n", + " --dev_filename={CORPUS_DIR}/dev.labeled \\\n", + " --out_filename={CORPUS_DIR}/label_map.txt\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9cGtBQSwRj-p" + }, + "outputs": [], + "source": [ + "! head {CORPUS_DIR}/label_map.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KL4DINweSgUQ" + }, + "outputs": [], + "source": [ + "! echo \"ADDRESS\" > {CORPUS_DIR}/semiotic_classes.txt\n", + "! echo \"CARDINAL\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "! 
echo \"DATE\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "! echo \"DECIMAL\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "! echo \"DIGIT\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "! echo \"ELECTRONIC\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "! echo \"FRACTION\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "! echo \"LETTERS\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "! echo \"MEASURE\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "! echo \"MONEY\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "! echo \"ORDINAL\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "! echo \"PLAIN\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "! echo \"PUNCT\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "! echo \"TELEPHONE\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "! echo \"TIME\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "! echo \"VERBATIM\" >> {CORPUS_DIR}/semiotic_classes.txt\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "b7JrZxs-WTS8" + }, + "outputs": [], + "source": [ + "! mkdir {WORK_DIR}/datasets\n", + "\n", + "! cp {CORPUS_DIR}/label_map.txt {WORK_DIR}/datasets/label_map.txt\n", + "! cp {CORPUS_DIR}/semiotic_classes.txt {WORK_DIR}/datasets/semiotic_classes.txt\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "E-fXGmAb63z0" + }, + "source": [ + "Now the file `train.labeled` contains all sentences from initial Google TN data, that we have been able to cover with out tag dictionary. \n", + "From it we can create different datasets for our neural model, trying different sizes and sampling strategies.\n", + "\n", + "Let's create a toy dataset of 5'000 sentences for train set and 5'000 sentences for dev set. Test set is not used - see Evaluation section below." + ] + }, + { + "cell_type": "code", + "source": [ + "DATASET = WORK_DIR + \"/datasets/itn_sample10k\"\n", + "! mkdir {DATASET}\n", + "!head -n 5000 {CORPUS_DIR}/train.labeled > {DATASET}/train.tsv\n", + "!head -n 5000 {CORPUS_DIR}/dev.labeled > {DATASET}/valid.tsv\n", + "!cp {DATASET}/valid.tsv {DATASET}/test.tsv\n" + ], + "metadata": { + "id": "KFwzGmuJlC0N" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# 4. Train a token classifier neural model\n", + "Now let's run training" + ], + "metadata": { + "id": "X1vWojxlmffT" + } + }, + { + "cell_type": "code", + "source": [ + "! python {NEMO_PATH}/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_train.py \\\n", + " lang=en \\\n", + " data.validation_ds.data_path={DATASET}/valid.tsv \\\n", + " data.train_ds.data_path={DATASET}/train.tsv \\\n", + " model.language_model.pretrained_model_name=bert-base-uncased \\\n", + " model.label_map={WORK_DIR}/datasets/label_map.txt \\\n", + " model.semiotic_classes={WORK_DIR}/datasets/semiotic_classes.txt \\\n", + " trainer.accelerator=gpu \\\n", + " trainer.max_epochs=1\n" + ], + "metadata": { + "id": "APBdPcihmFBa" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Each validation step generates three classification reports where rows correspond to different semiotic classes, `support` column is how many examples of this class occurred in the target of validation set, and `recall` column is the classifier **accuracy** on this class, i.e. percentage of _whole examples_ whose predicted tags match the target.\n", + "\n", + "1. Tag classification report. `PLAIN` class includes words that are tagged as ``.\n", + "2. Tag classification report for **multiword** examples only. 
They are less trivial and it is harder to achieve high accuracy on them.\n", + "3. Classification report for semiotic classes." + ], + "metadata": { + "id": "kjZU6fkvS0V5" + } + }, + { + "cell_type": "code", + "source": [ + "# the log can be found in nemo_experiments folder\n", + "! cat nemo_experiments/training/*/nemo_log_globalrank-0_localrank-0.txt" + ], + "metadata": { + "id": "gO1nez6AWJeW" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Inference" + ], + "metadata": { + "id": "hX-9t7XBqJbo" + } + }, + { + "cell_type": "markdown", + "source": [ + "Let's run the inference of our toy model.\n", + "First, copy the model, that we've just trained." + ], + "metadata": { + "id": "9x80qIKCsBQ7" + } + }, + { + "cell_type": "code", + "source": [ + "!cp nemo_experiments/training/*/checkpoints/training.nemo ." + ], + "metadata": { + "id": "dYfyklDTXuUM" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Generate some input sentences." + ], + "metadata": { + "id": "pVCV2Hchs-gG" + } + }, + { + "cell_type": "code", + "source": [ + "! echo \"on the ninth of may four days after her arrival at new orleans west carnifax was decommissioned and returned to the u s s b\" > test_sent.txt\n", + "! echo \"retrieved the fourth of october twenty fifteen\" >> test_sent.txt" + ], + "metadata": { + "id": "30KlsQ6uY6vu" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Run the inference." + ], + "metadata": { + "id": "uqyBEKn-tDXe" + } + }, + { + "cell_type": "code", + "source": [ + "!python {NEMO_PATH}/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_infer.py \\\n", + " pretrained_model=./training.nemo \\\n", + " inference.from_file=./test_sent.txt \\\n", + " inference.out_file=./test_sent.output" + ], + "metadata": { + "id": "SDSm6lg6ZOM_" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "! cat test_sent.output" + ], + "metadata": { + "id": "jrGJb9DcZ83E" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The inference output consists of 5 columns:\n", + "\n", + "1. Final output text.\n", + "2. Input text.\n", + "3. Sequence of predicted tags.\n", + "4. Sequence of tags after post-processing (some swaps may be applied).\n", + "5. 
Sequence of predicted semiotic classes - one class for each input word.\n", + "\n", + "```\n", + "on ninth may four days after her arrival at new orleans west carnifax was decommissioned and returned to the u s s b\ton the ninth of may four days after her arrival at new orleans west carnifax was decommissioned and returned to the u s s b\t \t \tPLAIN DATE DATE DATE DATE DATE PLAIN PLAIN PLAIN PLAIN PLAIN PLAIN PLAIN PLAIN PLAIN PLAIN PLAIN PLAIN PLAIN PLAIN PLAIN LETTERS LETTERS LETTERS LETTERS\n", + "retrieved 20 october 20 20\tretrieved the fourth of october twenty fifteen\t _20 _20 _20\t _20 _20 _20\tPLAIN DATE DATE DATE DATE DATE DATE```\n" + ], + "metadata": { + "id": "eYqtL7waaiZS" + } + }, + { + "cell_type": "markdown", + "source": [ + "We see that our toy model works and even manages to replace some numbers.\n", + "\n", + "To train a full-fledged model, you need more data.\n", + "\n", + "See also the scripts for the whole pipeline:\n", + "\n", + "> [prepare_dataset_en.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/text_normalization_as_tagging/prepare_dataset_en.sh)\n", + "\n", + "> [normalization_as_tagging_train.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_train.py)\n", + "\n", + "> [run_infer.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/text_normalization_as_tagging/run_infer.sh)\n", + "\n" + ], + "metadata": { + "id": "AY9sQCIcUEGO" + } + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "ITN_with_Thutmose_Tagger.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tutorials/text_processing/images/thutmose_tagger_alignment_bottom.png b/tutorials/text_processing/images/thutmose_tagger_alignment_bottom.png new file mode 100644 index 000000000000..03caff6667c2 Binary files /dev/null and b/tutorials/text_processing/images/thutmose_tagger_alignment_bottom.png differ diff --git a/tutorials/text_processing/images/thutmose_tagger_alignment_top.png b/tutorials/text_processing/images/thutmose_tagger_alignment_top.png new file mode 100644 index 000000000000..dcdad57c4e08 Binary files /dev/null and b/tutorials/text_processing/images/thutmose_tagger_alignment_top.png differ diff --git a/tutorials/text_processing/images/thutmose_tagger_architecture.png b/tutorials/text_processing/images/thutmose_tagger_architecture.png new file mode 100644 index 000000000000..4729ce51b425 Binary files /dev/null and b/tutorials/text_processing/images/thutmose_tagger_architecture.png differ diff --git a/tutorials/text_processing/images/thutmose_tagger_final_alignment.png b/tutorials/text_processing/images/thutmose_tagger_final_alignment.png new file mode 100644 index 000000000000..dec47fec5143 Binary files /dev/null and b/tutorials/text_processing/images/thutmose_tagger_final_alignment.png differ diff --git a/tutorials/text_processing/images/thutmose_tagger_tag_vocabulary.png b/tutorials/text_processing/images/thutmose_tagger_tag_vocabulary.png new file mode 100644 index 000000000000..5c97a0cb09ce Binary files /dev/null and b/tutorials/text_processing/images/thutmose_tagger_tag_vocabulary.png differ