diff --git a/Jenkinsfile b/Jenkinsfile index ea84290171e8..e2c73a9f9d04 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -124,12 +124,12 @@ pipeline { parallel { stage('En TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-26' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-26' } } stage('En ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-26' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-26' } } stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') { @@ -153,7 +153,7 @@ pipeline { stage('L2: Eng TN') { steps { sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-26 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1' - sh 'cd nemo_text_processing/text_normalization/ && python run_predict.py --input=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose' + sh 'cd nemo_text_processing/text_normalization/ && python normalize.py --input_file=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose' sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt' sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_04-14.txt || exit 1' sh 'rm -rf /home/TestData/nlp/text_norm/output/*' @@ -163,7 +163,7 @@ pipeline { stage('L2: Eng ITN export') { steps { sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-26 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1' - sh 'cd nemo_text_processing/inverse_text_normalization/ && python run_predict.py --input=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose' + sh 'cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output_file=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose' sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1' sh 'rm -rf /home/TestData/nlp/text_denorm/output/*' } diff --git a/docs/source/nlp/text_normalization/intro.rst b/docs/source/nlp/text_normalization/intro.rst index 5cb35408d849..e560372f8831 100644 --- a/docs/source/nlp/text_normalization/intro.rst +++ b/docs/source/nlp/text_normalization/intro.rst @@ -1,8 +1,6 @@ (Inverse) Text Normalization ============================ -NeMo supports Text Normalization (TN) and Inverse Text Normalization (ITN) tasks via rule-based 
`nemo_text_processing` python package and Neural-based TN/ITN model. - Rule-based (WFST) TN/ITN: .. toctree:: diff --git a/docs/source/nlp/text_normalization/wfst/images/deployment_pipeline.png b/docs/source/nlp/text_normalization/wfst/images/deployment_pipeline.png new file mode 100644 index 000000000000..9e8aa3750eb7 Binary files /dev/null and b/docs/source/nlp/text_normalization/wfst/images/deployment_pipeline.png differ diff --git a/docs/source/nlp/text_normalization/wfst/images/task_overview.png b/docs/source/nlp/text_normalization/wfst/images/task_overview.png new file mode 100644 index 000000000000..98aefe0364fa Binary files /dev/null and b/docs/source/nlp/text_normalization/wfst/images/task_overview.png differ diff --git a/docs/source/nlp/text_normalization/wfst/intro.rst b/docs/source/nlp/text_normalization/wfst/intro.rst index f79c576dceec..316e2a693bf2 100644 --- a/docs/source/nlp/text_normalization/wfst/intro.rst +++ b/docs/source/nlp/text_normalization/wfst/intro.rst @@ -1,22 +1,15 @@ WFST-based (Inverse) Text Normalization ======================================= -NeMo supports Text Normalization (TN) and Inverse Text Normalization (ITN) tasks via rule-based `nemo_text_processing` python package and Neural-based TN/ITN model. +NeMo supports Text Normalization (TN), audio-based TN and Inverse Text Normalization (ITN) tasks. -`nemo_text_processing` that is installed with the `nemo_toolkit`, see :doc:`NeMo Introduction <../starthere/intro>` for installation details. -Additional requirements can be found in `setup.sh `_. - -Tutorials on how to get started with WFST-based NeMo text normalization could be found `tutorials/text_processing `_. - -Rule-based (WFST) TN/ITN: +WFST-based TN/ITN: .. toctree:: :maxdepth: 2 wfst_text_normalization - wfst_inverse_text_normalization wfst_text_processing_deployment - wfst_api diff --git a/docs/source/nlp/text_normalization/wfst/wfst_api.rst b/docs/source/nlp/text_normalization/wfst/wfst_api.rst deleted file mode 100755 index bd6cfd9cedcc..000000000000 --- a/docs/source/nlp/text_normalization/wfst/wfst_api.rst +++ /dev/null @@ -1,37 +0,0 @@ -.. _wfst_api: - -NeMo Text Processing API -======================== - -Model Classes -------------- - -.. autoclass:: nemo_text_processing.text_normalization.en.GraphFst - :show-inheritance: - :members: - -.. autoclass:: nemo_text_processing.text_normalization.en.ClassifyFst - :show-inheritance: - :members: - - -.. autoclass:: nemo_text_processing.text_normalization.en.VerbalizeFst - :show-inheritance: - :members: - -.. autoclass:: nemo_text_processing.text_normalization.en.VerbalizeFinalFst - :show-inheritance: - :members: - -.. autoclass:: nemo_text_processing.inverse_text_normalization.en.ClassifyFst - :show-inheritance: - :members: - -.. autoclass:: nemo_text_processing.inverse_text_normalization.en.VerbalizeFst - :show-inheritance: - :members: - -.. autoclass:: nemo_text_processing.inverse_text_normalization.en.VerbalizeFinalFst - :show-inheritance: - :members: - diff --git a/docs/source/nlp/text_normalization/wfst/wfst_inverse_text_normalization.rst b/docs/source/nlp/text_normalization/wfst/wfst_inverse_text_normalization.rst deleted file mode 100644 index 3d03d0230fb3..000000000000 --- a/docs/source/nlp/text_normalization/wfst/wfst_inverse_text_normalization.rst +++ /dev/null @@ -1,110 +0,0 @@ -.. _wfst_itn: - -Inverse Text Normalization -========================== - -Inverse text normalization (ITN) is a part of the Automatic Speech Recognition (ASR) post-processing pipeline. 
-ITN is the task of converting the raw spoken output of the ASR model into its written form to improve text readability. - -Quick Start Guide ------------------ - -Integrate ITN to a text processing pipeline: -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: python - - # import WFST-based ITN module - from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer - - # initialize inverse normalizer - inverse_normalizer = InverseNormalizer(lang="en", cache_dir="CACHE_DIR") - - # try normalizer on a few examples - print(inverse_normalizer.normalize("it costs one hundred and twenty three dollars")) - # >>>"it costs $123" - - print(inverse_normalizer.normalize("in nineteen seventy")) - # >>> "in 1970" - - -Run prediction: -^^^^^^^^^^^^^^^ - -.. code:: - - # run prediction on - python run_predict.py --input= --output= --lang= \ - [--verbose] - - # single input prediction - python inverse_normalize.py --lang= \ - [--verbose] [--overwrite_cache] [--cache_dir=] - - -The input is expected to be lower-cased. Punctuation are outputted with separating spaces after semiotic tokens, e.g. `"i see, it is ten o'clock..."` -> `"I see, it is 10:00 . . ."`. -Inner-sentence white-space characters in the input are not maintained. -See the above scripts for more details. - - -NeMo ITN :cite:`textprocessing-itn-zhang2021nemo` is based on WFST-grammars (:cite:`textprocessing-itn-mohri2005weighted`, :cite:`textprocessing-itn-mohri2009weighted`). We also provide a deployment route to C++ using `Sparrowhawk `_ :cite:`textprocessing-itn-sparrowhawk` -- an open-source version of Google Kestrel :cite:`textprocessing-itn-ebden2015kestrel`. -See :doc:`Text Procesing Deployment <../tools/text_processing_deployment>` for details. - -.. note:: - - For more details, see the tutorial `NeMo/tutorials/text_processing/Inverse_Text_Normalization.ipynb `__ in `Google's Colab `_. - - -Evaluation ----------- - -Example evaluation run on (cleaned) `Google's text normalization dataset `__ :cite:`textprocessing-itn-sproat2016rnn`: - -.. code:: - - python run_evaluate.py --input=./en_with_types/output-00001-of-00100 --lang \ - [--cat CLASS_CATEGORY] [--filter] - -Supported Languages -------------------- - -ITN supports: English, Spanish, German, French, Vietnamese, and Russian languages. - -Classes --------- - -The base class for every grammar is :class:`GraphFst`. -This tool is designed as a two-stage application: 1. `classification` of the input into semiotic tokens and 2. `verbalization` into written form. -For every stage and every semiotic token class there is a corresponding grammar, e.g. :class:`taggers.CardinalFst` -and :class:`verbalizers.CardinalFst`. -Together, they compose the final grammars :class:`ClassifyFst` and -:class:`VerbalizeFinalFst` that are compiled into WFST and used for inference. - - - -.. autoclass:: nemo_text_processing.inverse_text_normalization.en.ClassifyFst - :show-inheritance: - :members: - -.. autoclass:: nemo_text_processing.inverse_text_normalization.en.VerbalizeFinalFst - :show-inheritance: - :members: - - -Installation ------------- - -`nemo_text_processing` is installed with the `nemo_toolkit`. - -See :doc:`NeMo Introduction <../starthere/intro>` for installation details. - -Additional requirements can be found in `setup.sh `_. - - -References ----------- - -.. 
bibliography:: ../tn_itn_all.bib - :style: plain - :labelprefix: TEXTPROCESSING-ITN - :keyprefix: textprocessing-itn- \ No newline at end of file diff --git a/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst b/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst index 1c1ad949fe43..da481d2f0b5f 100644 --- a/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst +++ b/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst @@ -1,184 +1,130 @@ .. _wfst_tn: -Text Normalization -================== +Text (Inverse) Normalization +============================ -NeMo Text Normalization converts text from written form into its verbalized form. It is used as a preprocessing step before Text to Speech (TTS). It could also be used for preprocessing Automatic Speech Recognition (ASR) training transcripts. +The `nemo_text_processing` Python package :cite:`textprocessing-norm-zhang2021nemo` is based on WFST grammars :cite:`textprocessing-norm-mohri2005weighted` and supports: +1. Text Normalization (TN) converts text from written form into its verbalized form. It is used as a preprocessing step before Text to Speech (TTS). For example, -Quick Start Guide ------------------ - -Integrate TN to a text processing pipeline: -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: python - - # import WFST-based TN module - from nemo_text_processing.text_normalization.normalize import Normalizer - - # initialize normalizer - normalizer = Normalizer(input_case="cased", lang="en") - - # try normalizer on a few examples - print(normalizer.normalize("123")) - # >>> one hundred twenty three - print(normalizer.normalize_list(["at 10:00", "it weights 10kg."], punct_post_process=True)) - # >>> ["at ten o'clock", 'it weights ten kilograms.'] - - -Run prediction: -^^^^^^^^^^^^^^^ +.. code-block:: bash -.. code:: + "123" -> "one hundred twenty three" - # run prediction on - python run_predict.py --input= --output= --lang= \ - [--input_case=] +2. Inverse text normalization (ITN) is a part of the Automatic Speech Recognition (ASR) post-processing pipeline and can be used to convert normalized ASR model outputs into written form to improve text readability. For example, + +.. code-block:: bash + + "one hundred twenty three" -> "123" - # single input prediction - python normalize.py --lang= \ - [--verbose] [--overwrite_cache] [--cache_dir=] [--input_case=] +3. Audio-based provides multiple normalization options. For example, +.. code-block:: bash + + "123" -> "one hundred twenty three", "one hundred and twenty three", "one two three", "one twenty three" ... -``INPUT_CASE`` specifies whether to treat the input as lower-cased or case sensitive. By default treat the input as cased since this is more informative, especially for abbreviations. Punctuation are outputted with separating spaces after semiotic tokens, e.g. `"I see, it is 10:00..."` -> `"I see, it is ten o'clock . . ."`. -Inner-sentence white-space characters in the input are not maintained. +The normalization which best reflects what is actually said in an audio is then picked. +Audio-based TN can be used to normalize ASR training data. + .. image:: images/task_overview.png + :align: center + :alt: Text TN and ITN + :scale: 50% -NeMo Text Normalization :cite:`textprocessing-norm-zhang2021nemo` is based on WFST-grammars :cite:`textprocessing-norm-mohri2005weighted` and :cite:`textprocessing-norm-mohri2009weighted`. 
\ -We also provide a deployment route to C++ using `Sparrowhawk `_ :cite:`textprocessing-norm-sparrowhawk` -- an open-source version of Google Kestrel :cite:`textprocessing-norm-ebden2015kestrel`. -See :doc:`Text Procesing Deployment ` for details. - - -.. note:: - - For more details, see the tutorial `NeMo/tutorials/text_processing/Text_Normalization.ipynb `__ in `Google's Colab `_. - - -Evaluation ----------- - -Example evaluation run on `Google's text normalization dataset `__ :cite:`textprocessing-norm-sproat2016rnn`: -.. code:: - - python run_evaluate.py --input=./en_with_types/output-00001-of-00100 --lang=en \ - [--cat CLASS_CATEGORY] [--input_case INPUT_CASE] - - -Classes -------- - -The base class for every grammar is :class:`GraphFst`. -This tool is designed as a two-stage application: 1. `classification` of the input into semiotic tokens and 2. `verbalization` into written form. -For every stage and every semiotic token class there is a corresponding grammar, e.g. :class:`taggers.CardinalFst` -and :class:`verbalizers.CardinalFst`. -Together, they compose the final grammars :class:`ClassifyFst` and -:class:`VerbalizeFinalFst` that are compiled into WFST and used for inference. - - -.. autoclass:: nemo_text_processing.text_normalization.en.ClassifyFst - :show-inheritance: - :members: - -.. autoclass:: nemo_text_processing.text_normalization.en.VerbalizeFinalFst - :show-inheritance: - :members: +Installation +------------ -Audio-based Text Normalization -============================== +`nemo_text_processing` is automatically installed with `NeMo `_. Quick Start Guide ----------------- -To normalize text that has corresponding audio recording, it is recommened to use `nemo_text_processing/text_normalization/normalize_with_audio.py `__ script \ -that provides multiple normalization options and chooses the one that minimizes character error rate (CER) of the automatic speech recognition (ASR) output. -The main difference between the default normalization and the audio-based one, is that most of the semiotic classes use deterministic=False flag. - -.. code-block:: python - # import WFST-based non-deterministic TN module - from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio +Text Normalization +^^^^^^^^^^^^^^^^^^ - # initialize normalizer - normalizer = NormalizerWithAudio( - lang="en", - input_case="cased", - overwrite_cache=False, - cache_dir="cache_dir", - ) - # try normalizer on a few examples - print(normalizer.normalize("123", n_tagged=10, punct_post_process=True)) - # >>> {'one hundred twenty three', 'one hundred and twenty three', 'one twenty three', 'one two three'} +.. code-block:: bash + cd NeMo/nemo_text_processing/text_normalization/ + python normalize.py --text="123" --language=en -To run this script with a .json manifest file, the manifest file should contain the following fields: -Parameters to run audio-based normalization (more details could be found in `nemo_text_processing/text_normalization/normalize_with_audio.py `__) +Inverse Text Normalization +^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. list-table:: Parameters to run audio-based normalization - :widths: 10 10 - :header-rows: 1 +.. 
code-block:: bash - * - **Parameter** - - **Description** - * - **audio_data** - - path to the audio file - * - **text** - - raw text - * - **pred_text** - - ASR model prediction - * - **n_tagged** - - Number of tagged options to return, -1 - return all possible tagged options + cd NeMo/nemo_text_processing/inverse_text_normalization/ + python inverse_normalize.py --text="one hundred twenty three" --language=en -See `examples/asr/transcribe_speech.py `__ on how to add ASR predictions. +Arguments: -When the manifest is ready, run: +* ``text`` - Input text. +* ``input_file`` - Input file with lines of input text. Only one of ``text`` or ``input_file`` is accepted. +* ``output_file`` - Output file to save normalizations. Needed if ``input_file`` is specified. +* ``language`` - language id. +* ``input_case`` - Only for text normalization. ``lower_cased`` or ``cased``. +* ``verbose`` - Outputs intermediate information. +* ``cache_dir`` - Specifies a cache directory for compiled grammars. If grammars exist, this significantly improves speed. +* ``overwrite_cache`` - Updates grammars in cache. +* ``whitelist`` - TSV file with custom mappings of written text to spoken form. -.. code-block:: python - python normalize_with_audio.py \ - --audio_data PATH/TO/MANIFEST.JSON \ - --language en +Audio-based TN +^^^^^^^^^^^^^^^^^^ -To run with a single audio file, specify path to audio and text with: +.. code-block:: bash - .. code-block:: python + cd NeMo/nemo_text_processing/text_normalization/ + python normalize_with_audio.py --text="123" --language="en" --n_tagged=10 --cache_dir="cache_dir" --audio_data="example.wav" --model="stt_en_conformer_ctc_large" - python normalize_with_audio.py \ - --audio_data PATH/TO/AUDIO.WAV \ - --language en \ - --text raw text OR PATH/TO/.TXT/FILE - --model QuartzNet15x5Base-En \ - --verbose +Additional Arguments: -To see possible normalization options for a text input without an audio file (could be used for debugging), run: +* ``text`` - Input text or `JSON manifest file `_ with multiple audio paths. +* ``audio_data`` - (Optional) Input audio. +* ``model`` - `Off-shelf NeMo CTC ASR model name `_ or path to local NeMo model checkpoint ending on .nemo +* ``n_tagged`` - number of normalization options to output. - .. code-block:: python - python python normalize_with_audio.py --text "RAW TEXT" --cache_dir "" - -Specify `--cache_dir` to generate .far grammars once and re-used them for faster inference. - -See `nemo_text_processing/text_normalization/normalize_with_audio.py `__ for more arguments. +.. note:: + More details can be found in `NeMo/tutorials/text_processing/Text_(Inverse)_Normalization.ipynb `__ in `Google's Colab `_. 
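The CLI entry points above wrap a Python API that can also be called directly from a text processing pipeline. Below is a minimal sketch; the `Normalizer`/`InverseNormalizer` class names and arguments follow the snippets this patch removes from the old docs and the updated `normalize.py`/`inverse_normalize.py` code, so treat it as illustrative rather than a guaranteed API surface.

.. code-block:: python

    # Illustrative only: mirrors the API examples removed elsewhere in this diff.
    from nemo_text_processing.text_normalization.normalize import Normalizer
    from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer

    # Text normalization: written form -> spoken form
    normalizer = Normalizer(input_case="cased", lang="en")
    print(normalizer.normalize("123", verbose=False))
    # -> "one hundred twenty three"

    # Inverse text normalization: spoken form -> written form
    inverse_normalizer = InverseNormalizer(lang="en")
    print(inverse_normalizer.inverse_normalize("one hundred twenty three", verbose=False))
    # -> "123"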
+ +Language Support Matrix +------------------------ + ++------------------+----------+----------+----------+--------------------+ +| **Language** | **ID** | **TN** | **ITN** | **Audio-based TN** | ++------------------+----------+----------+----------+--------------------+ +| English | en | x | x | x | ++------------------+----------+----------+----------+--------------------+ +| Spanish | es | x | x | x | ++------------------+----------+----------+----------+--------------------+ +| German | de | x | x | x | ++------------------+----------+----------+----------+--------------------+ +| French | fr | | x | | ++------------------+----------+----------+----------+--------------------+ +| Russian | ru | | x | x | ++------------------+----------+----------+----------+--------------------+ +| Vietnamese | vi | | x | | ++------------------+----------+----------+----------+--------------------+ + +Grammar customization +--------------------- -Supported Languages -------------------- +.. note:: -Deterministic TN supports: English, German and Spanish languages. -Non-deterministic (audio-based) TN supports: English, German, Spanish, and Russian languages. + In-depth walk through `NeMo/tutorials/text_processing/WFST_tutorial.ipynb `__ in `Google's Colab `_. -Installation ------------- -`nemo_text_processing` is installed with the `nemo_toolkit`. +Deploy to C++ +----------------- +See :doc:`Text Procesing Deployment ` for details. -See :doc:`NeMo Introduction <../starthere/intro>` for installation details. -Additional requirements can be found in `setup.sh `_. References ---------- diff --git a/docs/source/nlp/text_normalization/wfst/wfst_text_processing_deployment.rst b/docs/source/nlp/text_normalization/wfst/wfst_text_processing_deployment.rst index 9927ca2cd32a..d40926dad5d4 100644 --- a/docs/source/nlp/text_normalization/wfst/wfst_text_processing_deployment.rst +++ b/docs/source/nlp/text_normalization/wfst/wfst_text_processing_deployment.rst @@ -1,81 +1,93 @@ .. _wfst_deployment: -NeMo Text Processing Deployment -=============================== +Deploy to Production with C++ backend +===================================== + +NeMo provides tools to deploy :doc:`TN and ITN ` for production :cite:`textprocessing-deployment-zhang2021nemo`. +It uses `Sparrowhawk `_ :cite:`textprocessing-deployment-sparrowhawk` -- an open-source C++ framework by Google. +The grammars written with NeMo can be exported into an `OpenFST `_ Archive File (FAR) and dropped into Sparrowhawk. + + .. image:: images/deployment_pipeline.png + :align: center + :alt: Deployment pipeline + :scale: 50% -NeMo provides a tool for deployment of :doc:`NeMo Inverse Text Normalization (ITN) ` and :doc:`NeMo Text Normalization (TN) ` for production :cite:`textprocessing-deployment-zhang2021nemo`. -It uses `Sparrowhawk `_ :cite:`textprocessing-deployment-sparrowhawk` -- an open-source version of Google Kestrel :cite:`textprocessing-deployment-ebden2015kestrel`. -The scripts for deployment could be found at `NeMo/tools/text_processing_deployment `_. Requirements ------------ -:doc:`nemo_text_processing ` package +* :doc:`nemo_text_processing ` package +* `Docker `_ +* `NeMo source code `_ -Usage ------ +.. _wfst_deployment_quick_start: +Quick Start +----------- -Starts docker container with production backend with plugged in grammars. This is entry point script. +Examples how to run: -Arguments: -^^^^^^^^^ -* ``GRAMMARS`` - ``tn_grammars`` or ``itn_grammars`` to export either TN or ITN grammars from :doc:`WFST ITN ` or :doc:`WFST TN `. 
-* ``LANGUAGE`` - `en` for English -* ``INPUT_CASE`` - ``cased`` or ``lower_cased`` (lower_cased is supported only in TN grammars). -* ``MODE`` - choose ``test`` to run test on the grammars inside the container. +.. code-block:: bash -For example: + # export English TN grammars and return prompt inside docker container + cd NeMo/tools/text_processing_deployment + bash export_grammars.sh --GRAMMARS=tn_grammars --LANGUAGE=en --INPUT_CASE=cased + # export English ITN grammars and return prompt inside docker container + cd NeMo/tools/text_processing_deployment + bash export_grammars.sh --GRAMMARS=itn_grammars --LANGUAGE=en -.. code-block:: bash - # to export ITN grammars - cd NeMo/tools/text_processing_deployment - bash export_grammar.sh --GRAMMARS=itn_grammars --LANGUAGE=en +Arguments: +^^^^^^^^^^ +* ``GRAMMARS`` - ``tn_grammars`` or ``itn_grammars`` to export either TN or ITN grammars. +* ``LANGUAGE`` - `en` for English. Click :doc:`here ` for full list of languages. +* ``INPUT_CASE`` - ``cased`` or ``lower_cased`` (ITN has no differentiation between these two, only used for TN). +* ``MODE`` - By default ``export`` which returns prompt inside the docker. If ``--MODE=test`` runs NeMo pytests inside container. +* ``OVERWRITE_CACHE`` - Whether to re-export grammars or load from cache. By default ``True``. +* ``FORCE_REBUILD`` - Whether to rebuild docker image in cased of updated dependencies. By default ``False``. - # to export and test TN grammars - bash export_grammar.sh --GRAMMARS=itn_grammars --INPUT_CASE=cased --MODE=test --LANGUAGE=en +Detailed pipeline +----------------- -This script runs the following steps in sequence: +`export_grammars.sh` runs the following steps in sequence: -Exports grammar `ClassifyFst` and `VerbalizeFst` from :doc:`nemo_text_processing ` to `OUTPUT_DIR/classify/tokenize_and_classify.far` and `OUTPUT_DIR/verbalize/verbalize.far` respectively. +Go to script folder: .. code-block:: bash cd NeMo/tools/text_processing_deployment - python pynini_export.py <--output_dir OUTPUT_DIR> <--grammars GRAMMARS> <--input_case INPUT_CASE> <--language LANGUAGE> -Builds C++ production backend docker +1. Grammars written in Python are exported to `OpenFST `_ archive files (FAR). Specifically, grammars `ClassifyFst` and `VerbalizeFst` from :doc:`nemo_text_processing ` are exported and saved to `./LANGUAGE/classify/tokenize_and_classify.far` and `./LANGUAGE/verbalize/verbalize.far` respectively. .. code-block:: bash - cd NeMo/tools/text_processing_deployment - bash docker/build.sh + python pynini_export.py <--output_dir .> <--grammars GRAMMARS> <--input_case INPUT_CASE> <--language LANGUAGE> +.. warning:: -Plugs in grammars into production backend by mounting grammar directory `classify/` and `verbalize/` with sparrowhawk grammar directory inside docker. Returns docker prompt + TN and ITN grammars are saved to the same file by default. + +2. Docker image is built with dependencies, including `Thrax `_ and `Sparrowhawk `_. .. code-block:: bash - cd NeMo/tools/text_processing_deployment - # to launch container with the exported grammars - bash docker/launch.sh + bash docker/build.sh - # to launch container with the exported grammars and run tests on TN grammars - bash docker/launch.sh test_tn_grammars +3. Plugs in grammars into production backend by mounting grammar directory `LANGUAGE/classify/` and `LANGUAGE/verbalize/` inside docker. Returns docker prompt. - # to launch container with the exported grammars and run tests on ITN grammars - bash docker/launch.sh test_itn_grammars +.. 
code-block:: bash + # launch container with the exported grammars + bash docker/launch.sh -Runs TN or ITN in docker container: +4. Runs system in docker container. .. code-block:: bash - echo "two dollars fifty" | ../../src/bin/normalizer_main --config=sparrowhawk_configuration.ascii_proto + echo "ITN result: two dollars fifty. TN result: $2.50" | ../../src/bin/normalizer_main --config=sparrowhawk_configuration.ascii_proto -This returns $2.50 for ITN. +This returns "ITN result: $2.50. TN result: two dollars fifty cents" References ---------- diff --git a/docs/source/starthere/tutorials.rst b/docs/source/starthere/tutorials.rst index a526b28130e5..280c0555bc40 100644 --- a/docs/source/starthere/tutorials.rst +++ b/docs/source/starthere/tutorials.rst @@ -146,14 +146,11 @@ To run a tutorial: - CTC Segmentation - `CTC Segmentation `_ * - Text Processing - - Text Normalization for TTS - - `Text Normalization `_ - * - Text Processing - - Inverse Text Normalization for ASR - - `Inverse Text Normalization `_ + - Text Normalization and Inverse Normalization for ASR and TTS + - `Text Normalization `_ * - Text Processing - Inverse Text Normalization for ASR - Thutmose Tagger - `Inverse Text Normalization with Thutmose Tagger `_ * - Text Processing - Constructing Normalization Grammars with WFSTs - - `WFST Tutorial `_ + - `WFST Tutorial `_ diff --git a/nemo_text_processing/README.md b/nemo_text_processing/README.md index 46cce95b281a..181d6ddc8e3b 100644 --- a/nemo_text_processing/README.md +++ b/nemo_text_processing/README.md @@ -5,4 +5,4 @@ Introduction ------------ NeMo's `nemo_text_processing` is a Python package that is installed with the `nemo_toolkit`. -See [documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/text_normalization/intro.html) for details. \ No newline at end of file +See [documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/wfst_text_normalization.html) for details. \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/README.md b/nemo_text_processing/inverse_text_normalization/README.md index d3525922c2f6..c0dcb24481b8 100644 --- a/nemo_text_processing/inverse_text_normalization/README.md +++ b/nemo_text_processing/inverse_text_normalization/README.md @@ -1,61 +1,10 @@ # Inverse Text Normalization -Inverse text normalization (ITN) is a part of the Automatic Speech Recognition (ASR) post-processing pipeline. -ITN is the task of converting the raw spoken output of the ASR model into its written form to improve text readability. +Inverse Text Normalization is part of NeMo's `nemo_text_processing` - a Python package that is installed with the `nemo_toolkit`. +It converts text from spoken form into written form, e.g. "one hundred twenty three" -> "123". -For example, `one hundred twenty three kilograms` -> `123 kg` +See [NeMo documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/wfst_text_normalization.html) for details. -# Documentation +Tutorial with overview of the package capabilities: [Text_(Inverse)_Normalization.ipynb](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Text_(Inverse)_Normalization.ipynb) -[ITN documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/wfst_inverse_text_normalization.html). 
- -[TN/INT tutorials NeMo/tutorials/text_processing](https://github.com/NVIDIA/NeMo/tree/stable/tutorials/text_processing). - -# Installation - -``` bash setup.sh ``` - -# Integrate ITN to a text processing pipeline - -``` -# import WFST-based ITN module -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer - -# initialize inverse normalizer -inverse_normalizer = InverseNormalizer(lang="en", cache_dir="CACHE_DIR") - -# try normalizer on a few examples -print(inverse_normalizer.normalize("it costs one hundred and twenty three dollars")) -# >>>"it costs $123" - -print(inverse_normalizer.normalize("in nineteen seventy")) -# >>> "in 1970" -``` - -# Prediction - -``` -# run prediction on -python run_predict.py --input= --output= --lang= \ - [--verbose] - -# single input prediction -python inverse_normalize.py --lang= \ - [--verbose] [--overwrite_cache] [--cache_dir=] -``` - -The input is expected to be lower-cased. Punctuation are outputted with separating spaces after semiotic tokens, e.g. `"i see, it is ten o'clock..."` -> `"I see, it is 10:00 . . ."`. -Inner-sentence white-space characters in the input are not maintained. -See the above scripts for more details. - -# Supported Languages - -ITN supports: English, Spanish, German, French, Vietnamese, and Russian languages. - -# Evaluation -Evaluation on text normalization data, e.g. [Google text normalization dataset](https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish). - -``` -python run_evaluate.py --input=./en_with_types/output-00001-of-00100 \ - [--cat CATEGORY] [--filter] -``` \ No newline at end of file +Tutorial on how to customize the underlying gramamrs: [WFST_Tutorial.ipynb](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/WFST_Tutorial.ipynb) \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index fa0a3e29d35f..6b2c84e571b4 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -16,7 +16,12 @@ from time import perf_counter from typing import List -from nemo_text_processing.text_normalization.data_loader_utils import check_installation, get_installation_msg +from nemo_text_processing.text_normalization.data_loader_utils import ( + check_installation, + get_installation_msg, + load_file, + write_file, +) from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.token_parser import TokenParser @@ -102,7 +107,10 @@ def inverse_normalize(self, text: str, verbose: bool) -> str: def parse_args(): parser = ArgumentParser() - parser.add_argument("input_string", help="input string", type=str) + input = parser.add_mutually_exclusive_group() + input.add_argument("--text", dest="input_string", help="input string", type=str) + input.add_argument("--input_file", dest="input_file", help="input file path", type=str) + parser.add_argument('--output_file', dest="output_file", help="output file path", type=str) parser.add_argument( "--language", help="language", choices=['en', 'de', 'es', 'ru', 'fr', 'vi'], default="en", type=str ) @@ -124,6 +132,17 @@ def parse_args(): lang=args.language, cache_dir=args.cache_dir, overwrite_cache=args.overwrite_cache ) print(f'Time to generate graph: {round(perf_counter() - start_time, 2)} sec') - 
start_time = perf_counter() - print(inverse_normalizer.inverse_normalize(args.input_string, verbose=args.verbose)) - print(f'Execution time: {round(perf_counter() - start_time, 2)} sec') + + if args.input_string: + print(inverse_normalizer.inverse_normalize(args.input_string, verbose=args.verbose)) + elif args.input_file: + print("Loading data: " + args.input_file) + data = load_file(args.input_file) + + print("- Data: " + str(len(data)) + " sentences") + prediction = inverse_normalizer.inverse_normalize_list(data, verbose=args.verbose) + if args.output_file: + write_file(args.output_file, prediction) + print(f"- Denormalized. Writing out to {args.output_file}") + else: + print(prediction) diff --git a/nemo_text_processing/inverse_text_normalization/run_predict.py b/nemo_text_processing/inverse_text_normalization/run_predict.py deleted file mode 100644 index 811f8da11523..000000000000 --- a/nemo_text_processing/inverse_text_normalization/run_predict.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from argparse import ArgumentParser -from typing import List - -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer - - -''' -Runs inverse text normalization on text data -''' - - -def load_file(file_path: str) -> List[str]: - """ - Load given text file into list of string. - - Args: - file_path: file path - - Returns: flat list of string - """ - res = [] - with open(file_path, 'r', encoding='utf-8') as fp: - for line in fp: - res.append(line) - return res - - -def write_file(file_path: str, data: List[str]): - """ - Writes out list of string to file. - - Args: - file_path: file path - data: list of string - - """ - with open(file_path, 'w', encoding='utf-8') as fp: - for line in data: - fp.write(line + '\n') - - -def parse_args(): - parser = ArgumentParser() - parser.add_argument("--input", help="input file path", required=True, type=str) - parser.add_argument( - "--language", help="language", choices=['en', 'de', 'es', 'ru', 'fr', 'vi'], default="en", type=str - ) - parser.add_argument("--output", help="output file path", required=True, type=str) - parser.add_argument("--verbose", help="print denormalization info. For debugging", action='store_true') - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - file_path = args.input - inverse_normalizer = InverseNormalizer(lang=args.language) - - print("Loading data: " + file_path) - data = load_file(file_path) - - print("- Data: " + str(len(data)) + " sentences") - inverse_normalizer_prediction = inverse_normalizer.inverse_normalize_list(data, verbose=args.verbose) - write_file(args.output, inverse_normalizer_prediction) - print(f"- Denormalized. 
Writing out to {args.output}") diff --git a/nemo_text_processing/text_normalization/README.md b/nemo_text_processing/text_normalization/README.md index 83018cac03c8..515572f27b76 100644 --- a/nemo_text_processing/text_normalization/README.md +++ b/nemo_text_processing/text_normalization/README.md @@ -1,53 +1,10 @@ # Text Normalization -NeMo Text Normalization converts text from written form into its verbalized form. It is used as a preprocessing step before Text to Speech (TTS). It could also be used for preprocessing Automatic Speech Recognition (ASR) training transcripts. +Text Normalization is part of NeMo's `nemo_text_processing` - a Python package that is installed with the `nemo_toolkit`. +It converts text from written form into its verbalized form, e.g. "123" -> "one hundred twenty three". -For example, `123 kg` -> `one hundred twenty three kilograms` +See [NeMo documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/wfst_text_normalization.html) for details. -# Documentation +Tutorial with overview of the package capabilities: [Text_(Inverse)_Normalization.ipynb](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Text_(Inverse)_Normalization.ipynb) -[TN documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/wfst_text_normalization.html). - -[TN/INT tutorials: NeMo/tutorials/text_processing](https://github.com/NVIDIA/NeMo/tree/stable/tutorials/text_processing). - -# Installation -`bash ../setup.sh` - -# Integrate TN to a text processing pipeline - -``` -# import WFST-based TN module -from nemo_text_processing.text_normalization.normalize import Normalizer - -# initialize normalizer -normalizer = Normalizer(input_case="cased", lang="en") - -# try normalizer on a few examples -print(normalizer.normalize("123")) -# >>> one hundred twenty three -print(normalizer.normalize_list(["at 10:00", "it weights 10kg."], punct_post_process=True)) -# >>> ["at ten o'clock", 'it weights ten kilograms.'] -``` - -# Prediction - -``` -# run prediction on -python run_predict.py --input= --output= --lang= \ - [--input_case=] - -# single input prediction -python normalize.py --lang= \ - [--verbose] [--overwrite_cache] [--cache_dir=] [--input_case=] -``` - -# Evaluation - -Evaluation on text normalization data, e.g. [Google text normalization dataset](https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish). - - -``` python run_evaluate.py --input=./en_with_types/output-00001-of-00100 [--cat CATEGORY] ``` - -# Audio-based normalization - -See [documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/wfst_text_normalization.html#audio-based-text-normalization) for more details. 
\ No newline at end of file +Tutorial on how to customize the underlying gramamrs: [WFST_Tutorial.ipynb](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/WFST_Tutorial.ipynb) \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py index 887ca75cc5eb..4a83e1f49579 100644 --- a/nemo_text_processing/text_normalization/data_loader_utils.py +++ b/nemo_text_processing/text_normalization/data_loader_utils.py @@ -41,7 +41,7 @@ ] -def load_kaggle_text_norm_file(file_path: str) -> List[Instance]: +def _load_kaggle_text_norm_file(file_path: str) -> List[Instance]: """ https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish Loads text file in the Kaggle Google text normalization file format: \t\t<`self` if trivial class or normalized text> @@ -81,7 +81,7 @@ def load_kaggle_text_norm_file(file_path: str) -> List[Instance]: return res -def load_files(file_paths: List[str], load_func=load_kaggle_text_norm_file) -> List[Instance]: +def load_files(file_paths: List[str], load_func=_load_kaggle_text_norm_file) -> List[Instance]: """ Load given list of text files using the `load_func` function. @@ -254,3 +254,33 @@ def check_installation(): def get_installation_msg(): msg = "`pynini` is not installed ! \n Please run the `nemo_text_processing/setup.sh` script prior to usage of this toolkit." return msg + + +def load_file(file_path: str) -> List[str]: + """ + Loads given text file with separate lines into list of string. + + Args: + file_path: file path + + Returns: flat list of string + """ + res = [] + with open(file_path, 'r') as fp: + for line in fp: + res.append(line) + return res + + +def write_file(file_path: str, data: List[str]): + """ + Writes out list of string to file. + + Args: + file_path: file path + data: list of string + + """ + with open(file_path, 'w') as fp: + for line in data: + fp.write(line + '\n') diff --git a/nemo_text_processing/text_normalization/en/taggers/range.py b/nemo_text_processing/text_normalization/en/taggers/range.py index 721efd5bc571..42ccd5faf4d0 100644 --- a/nemo_text_processing/text_normalization/en/taggers/range.py +++ b/nemo_text_processing/text_normalization/en/taggers/range.py @@ -13,12 +13,7 @@ # limitations under the License. 
-from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_DIGIT, - GraphFst, - convert_space, - delete_extra_space, -) +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, convert_space try: import pynini @@ -50,7 +45,7 @@ def __init__( delete_space = pynini.closure(pynutil.delete(" "), 0, 1) cardinal = cardinal.graph_with_and - approx = pynini.cross("~", "approximately") + delete_extra_space + approx = pynini.cross("~", "approximately") # TIME time_graph = time + delete_space + pynini.cross("-", " to ") + delete_space + time diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index a8b583a6e06c..84bb00af1f0c 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -20,7 +20,12 @@ from math import factorial from typing import Dict, List, Union -from nemo_text_processing.text_normalization.data_loader_utils import get_installation_msg, pre_process +from nemo_text_processing.text_normalization.data_loader_utils import ( + get_installation_msg, + load_file, + pre_process, + write_file, +) from nemo_text_processing.text_normalization.token_parser import PRESERVE_ORDER_KEY, TokenParser from tqdm import tqdm @@ -111,7 +116,9 @@ def __init__( self.processor = None print("NeMo NLP is not available. Moses de-tokenization will be skipped.") - def normalize_list(self, texts: List[str], verbose=False, punct_post_process: bool = False) -> List[str]: + def normalize_list( + self, texts: List[str], verbose=False, punct_pre_process: bool = False, punct_post_process: bool = False + ) -> List[str]: """ NeMo text normalizer @@ -124,7 +131,9 @@ def normalize_list(self, texts: List[str], verbose=False, punct_post_process: bo res = [] for input in tqdm(texts): try: - text = self.normalize(input, verbose=verbose, punct_post_process=punct_post_process) + text = self.normalize( + input, verbose=verbose, punct_pre_process=punct_pre_process, punct_post_process=punct_post_process + ) except: print(input) raise Exception @@ -367,7 +376,10 @@ def select_verbalizer(self, lattice: 'pynini.FstLike') -> str: def parse_args(): parser = ArgumentParser() - parser.add_argument("input_string", help="input string", type=str) + input = parser.add_mutually_exclusive_group() + input.add_argument("--text", dest="input_string", help="input string", type=str) + input.add_argument("--input_file", dest="input_file", help="input file path", type=str) + parser.add_argument('--output_file', dest="output_file", help="output file path", type=str) parser.add_argument("--language", help="language", choices=["en", "de", "es"], default="en", type=str) parser.add_argument( "--input_case", help="input capitalization", choices=["lower_cased", "cased"], default="cased", type=str @@ -393,6 +405,7 @@ def parse_args(): if __name__ == "__main__": args = parse_args() whitelist = os.path.abspath(args.whitelist) if args.whitelist else None + normalizer = Normalizer( input_case=args.input_case, cache_dir=args.cache_dir, @@ -400,11 +413,28 @@ def parse_args(): whitelist=whitelist, lang=args.language, ) - print( - normalizer.normalize( - args.input_string, + if args.input_string: + print( + normalizer.normalize( + args.input_string, + verbose=args.verbose, + punct_pre_process=args.punct_pre_process, + punct_post_process=args.punct_post_process, + ) + ) + elif args.input_file: + print("Loading data: " + args.input_file) + data = load_file(args.input_file) + + print("- 
Data: " + str(len(data)) + " sentences") + normalizer_prediction = normalizer.normalize_list( + data, verbose=args.verbose, punct_pre_process=args.punct_pre_process, punct_post_process=args.punct_post_process, ) - ) + if args.output_file: + write_file(args.output_file, normalizer_prediction) + print(f"- Normalized. Writing out to {args.output_file}") + else: + print(normalizer_prediction) diff --git a/nemo_text_processing/text_normalization/run_predict.py b/nemo_text_processing/text_normalization/run_predict.py deleted file mode 100644 index 2d99e7a088d1..000000000000 --- a/nemo_text_processing/text_normalization/run_predict.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from argparse import ArgumentParser -from typing import List - -from nemo_text_processing.text_normalization.normalize import Normalizer - - -''' -Runs normalization prediction on text data -''' - - -def load_file(file_path: str) -> List[str]: - """ - Load given text file into list of string. - - Args: - file_path: file path - - Returns: flat list of string - """ - res = [] - with open(file_path, 'r') as fp: - for line in fp: - res.append(line) - return res - - -def write_file(file_path: str, data: List[str]): - """ - Writes out list of string to file. - - Args: - file_path: file path - data: list of string - - """ - with open(file_path, 'w') as fp: - for line in data: - fp.write(line + '\n') - - -def parse_args(): - parser = ArgumentParser() - parser.add_argument("--input", help="input file path", required=True, type=str) - parser.add_argument("--language", help="language", choices=['en'], default="en", type=str) - parser.add_argument("--output", help="output file path", required=True, type=str) - parser.add_argument( - "--input_case", help="input capitalization", choices=["lower_cased", "cased"], default="cased", type=str - ) - parser.add_argument("--verbose", help="print meta info for debugging", action='store_true') - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - file_path = args.input - normalizer = Normalizer(input_case=args.input_case, lang=args.language) - - print("Loading data: " + file_path) - data = load_file(file_path) - - print("- Data: " + str(len(data)) + " sentences") - normalizer_prediction = normalizer.normalize_list(data, verbose=args.verbose) - write_file(args.output, normalizer_prediction) - print(f"- Normalized. 
Writing out to {args.output}") diff --git a/requirements/requirements_text_processing.txt b/requirements/requirements_nemo_text_processing.txt similarity index 100% rename from requirements/requirements_text_processing.txt rename to requirements/requirements_nemo_text_processing.txt diff --git a/setup.py b/setup.py index 5029b788f327..5514c7057250 100644 --- a/setup.py +++ b/setup.py @@ -80,7 +80,7 @@ def req_file(filename, folder="requirements"): # User packages 'test': req_file("requirements_test.txt"), # NeMo Tools - 'text_processing': req_file("requirements_text_processing.txt"), + 'nemo_text_processing': req_file("requirements_nemo_text_processing.txt"), # Torch Packages # 'torch_tts': req_file("requirements_torch_tts.txt"), ## Removed in 1.7.0 # Lightning Collections Packages @@ -96,7 +96,7 @@ def req_file(filename, folder="requirements"): extras_require['all'] = list(chain(extras_require.values())) # Add lightning requirements as needed -extras_require['common'] = list(chain([extras_require['common'], extras_require['text_processing']])) +extras_require['common'] = list(chain([extras_require['common'], extras_require['nemo_text_processing']])) extras_require['test'] = list(chain([extras_require['tts'], extras_require['core'], extras_require['common']])) extras_require['asr'] = list(chain([extras_require['asr'], extras_require['core'], extras_require['common']])) extras_require['cv'] = list(chain([extras_require['cv'], extras_require['core'], extras_require['common']])) diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_address.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_address.txt index dad2d181b1cd..20729bc3b9c7 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_address.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_address.txt @@ -5,4 +5,5 @@ 1211 E Arques Ave~twelve eleven East Arques Avenue 708 N 1st St, San City~seven zero eight North first Street, San City 12 S 1st st~twelve South first Street -1990 for the Ata ST~nineteen ninety for the Ata ST \ No newline at end of file +1990 for the Ata ST~nineteen ninety for the Ata ST +Main St.~Main St . \ No newline at end of file diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_serial.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_serial.txt index e3d2f481312d..8e79047406a3 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_serial.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_serial.txt @@ -29,4 +29,5 @@ covid-19~covid-nineteen a 4-kilogram bag~a four-kilogram bag 100-car~one hundred-car 123/261788/2021~one hundred twenty three/two six one seven eight eight/two thousand twenty one -2*8~two asterisk eight \ No newline at end of file +2*8~two asterisk eight +and/or~and slash or \ No newline at end of file diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_time.txt index a340ebcc11ea..c337c6ddf086 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_time.txt @@ -18,3 +18,4 @@ 11:20AM,2:40PM and 10:10PM.~eleven twenty AM , two forty PM and ten ten PM . 2pm-5pm~two PM to five PM 5pm~five PM +11:30...~eleven thirty . . . 
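As a usage note for the `normalize.py` changes earlier in this diff: besides the new `--input_file`/`--output_file` flags, batch normalization can be driven from Python via the extended `normalize_list` signature shown in that hunk. A hedged sketch; the example sentences are made up and the keyword arguments are taken from the updated signature.

```python
# Sketch of batch use of the extended normalize_list() from the normalize.py hunk above.
from nemo_text_processing.text_normalization.normalize import Normalizer

normalizer = Normalizer(input_case="cased", lang="en")
sentences = ["at 10:00", "it weighs 10 kg."]  # hypothetical inputs
predictions = normalizer.normalize_list(
    sentences, verbose=False, punct_pre_process=True, punct_post_process=True
)
print(predictions)
```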
diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_whitelist.txt index 1e3aeab78e1f..37d8d54d9556 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_whitelist.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_whitelist.txt @@ -1,3 +1,6 @@ Dr. Evil~doctor Evil Mrs. Norris~misses Norris dr. Evil~dr . Evil +DNA is~DNA is +C. S. Lewis~CS Lewis +tv~TV diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_word.txt index f64eb54b93dc..d74a902a7729 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_word.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_word.txt @@ -40,3 +40,5 @@ $ and 5% or %~dollar and five percent or percent sign (1)Hello~( one ) Hello !1~! one 1!hello~one ! hello +love him while we may,~love him while we may , +mar~mar diff --git a/tutorials/text_processing/Inverse_Text_Normalization.ipynb b/tutorials/text_processing/Inverse_Text_Normalization.ipynb deleted file mode 100755 index 6883ddbeb95e..000000000000 --- a/tutorials/text_processing/Inverse_Text_Normalization.ipynb +++ /dev/null @@ -1,515 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 5, - "metadata": { - "colab": { - "name": "Inverse_Text_Normalization.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.8" - } - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "id": "U1GACXvL5GhV" - }, - "source": [ - "if 'google.colab' in str(get_ipython()):\n", - " !pip install -q condacolab\n", - " import condacolab\n", - " condacolab.install()" - ], - "id": "U1GACXvL5GhV", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "YxVLI-f97Kxl" - }, - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "\"\"\"\n", - "\n", - "BRANCH = 'r1.9.0'" - ], - "id": "YxVLI-f97Kxl", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "TcWLxxXC7Jgx" - }, - "source": [ - "\n", - "# If you're using Google Colab and not running locally, run this cell.\n", - "# install NeMo\n", - "if 'google.colab' in str(get_ipython()):\n", - " !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" - ], - "id": "TcWLxxXC7Jgx", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "4nf8sui349co" - }, - "source": [ - "if 'google.colab' in str(get_ipython()):\n", - " !conda install -c conda-forge pynini=2.1.3\n", - " ! mkdir images\n", - " ! 
wget https://github.com/NVIDIA/NeMo/blob/$BRANCH/tutorials/text_processing/images/deployment.png -O images/deployment.png\n", - " ! wget https://github.com/NVIDIA/NeMo/blob/$BRANCH/tutorials/text_processing/images/pipeline.png -O images/pipeline.png" - ], - "id": "4nf8sui349co", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "homeless-richardson" - }, - "source": [ - "import os\n", - "import wget\n", - "import pynini\n", - "import nemo_text_processing" - ], - "id": "homeless-richardson", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "interracial-listing" - }, - "source": [ - "# Task Description\n", - "\n", - "Inverse text normalization (ITN) is a part of the Automatic Speech Recognition (ASR) post-processing pipeline. \n", - "\n", - "ITN is the task of converting the raw spoken output of the ASR model into its written form to improve the text readability. For example, `in nineteen seventy five` should be changed to `in 1975` and `one hundred and twenty three dollars` to `$123`." - ], - "id": "interracial-listing" - }, - { - "cell_type": "markdown", - "metadata": { - "id": "desirable-words" - }, - "source": [ - "# NeMo Inverse Text Normalization\n", - "\n", - "NeMo ITN is based on weighted finite-state\n", - "transducer (WFST) grammars. The tool uses [`Pynini`](https://github.com/kylebgorman/pynini) to construct WFSTs, and the created grammars can be exported and integrated into [`Sparrowhawk`](https://github.com/google/sparrowhawk) (an open-source version of [The Kestrel TTS text normalization system](https://www.cambridge.org/core/journals/natural-language-engineering/article/abs/kestrel-tts-text-normalization-system/F0C18A3F596B75D83B75C479E23795DA)) for production. The NeMo ITN tool can be seen as a Python extension of `Sparrowhawk`. \n", - "\n", - "Currently, NeMo ITN provides support for English and the following semiotic classes from the [Google Text normalization dataset](https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish):\n", - "DATE, CARDINAL, MEASURE, DECIMAL, ORDINAL, MONEY, TIME, PLAIN. \n", - "We additionally added the class `WHITELIST` for all whitelisted tokens whose verbalizations are directly looked up from a user-defined list.\n", - "\n", - "The toolkit is modular, easily extendable, and can be adapted to other languages and tasks like [text normalization](https://github.com/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Text_Normalization.ipynb). The Python environment enables an easy combination of text covering grammars with NNs. \n", - "\n", - "The rule-based system is divided into a classifier and a verbalizer following [Google's Kestrel](https://www.researchgate.net/profile/Richard_Sproat/publication/277932107_The_Kestrel_TTS_text_normalization_system/links/57308b1108aeaae23f5cc8c4/The-Kestrel-TTS-text-normalization-system.pdf) design: the classifier is responsible for detecting and classifying semiotic classes in the underlying text, the verbalizer the verbalizes the detected text segment. 
\n", - "\n", - "The overall NeMo ITN pipeline from development in `Pynini` to deployment in `Sparrowhawk` is shown below:\n", - "![alt text](images/deployment.png \"Inverse Text Normalization Pipeline\")" - ], - "id": "desirable-words" - }, - { - "cell_type": "markdown", - "metadata": { - "id": "military-radius" - }, - "source": [ - "# Quick Start\n", - "\n", - "## Add ITN to your Python ASR post-processing workflow\n", - "\n", - "ITN is a part of the `nemo_text_processing` package which is installed with `nemo_toolkit`. Installation instructions could be found [here](https://github.com/NVIDIA/NeMo/tree/main/README.rst)." - ], - "id": "military-radius" - }, - { - "cell_type": "code", - "metadata": { - "id": "limiting-genesis" - }, - "source": [ - "from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer\n", - "\n", - "inverse_normalizer = InverseNormalizer(lang='en')\n", - "\n", - "raw_text = \"we paid one hundred and twenty three dollars for this desk, and this.\"\n", - "inverse_normalizer.inverse_normalize(raw_text, verbose=False)" - ], - "id": "limiting-genesis", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "downtown-inventory" - }, - "source": [ - "In the above cell, `one hundred and twenty three dollars` would be converted to `$123`, and the rest of the words remain the same.\n", - "\n", - "## Run Inverse Text Normalization on an input from a file\n", - "\n", - "Use `run_predict.py` to convert a spoken text from a file `INPUT_FILE` to a written format and save the output to `OUTPUT_FILE`. Under the hood, `run_predict.py` is calling `inverse_normalize()` (see the above section)." - ], - "id": "downtown-inventory" - }, - { - "cell_type": "code", - "metadata": { - "id": "streaming-butterfly" - }, - "source": [ - "# If you're running the notebook locally, update the NEMO_TEXT_PROCESSING_PATH below\n", - "# In Colab, a few required scripts will be downloaded from NeMo github\n", - "\n", - "NEMO_TOOLS_PATH = '/nemo_text_processing/inverse_text_normalization'\n", - "DATA_DIR = 'data_dir'\n", - "os.makedirs(DATA_DIR, exist_ok=True)\n", - "\n", - "if 'google.colab' in str(get_ipython()):\n", - " NEMO_TOOLS_PATH = '.'\n", - "\n", - " required_files = ['run_predict.py',\n", - " 'run_evaluate.py']\n", - " for file in required_files:\n", - " if not os.path.exists(file):\n", - " file_path = 'https://raw.githubusercontent.com/NVIDIA/NeMo/' + BRANCH + '/nemo_text_processing/inverse_text_normalization/' + file\n", - " print(file_path)\n", - " wget.download(file_path)\n", - "elif not os.path.exists(NEMO_TOOLS_PATH):\n", - " raise ValueError(f'update path to NeMo root directory')\n", - "\n", - "INPUT_FILE = f'{DATA_DIR}/test.txt'\n", - "OUTPUT_FILE = f'{DATA_DIR}/test_itn.txt'\n", - "\n", - "! echo \"on march second twenty twenty\" > $DATA_DIR/test.txt\n", - "! python $NEMO_TOOLS_PATH/run_predict.py --input=$INPUT_FILE --output=$OUTPUT_FILE --language='en'" - ], - "id": "streaming-butterfly", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "cordless-louisiana" - }, - "source": [ - "# check that the raw text was indeed converted to the written form\n", - "! 
cat $OUTPUT_FILE" - ], - "id": "cordless-louisiana", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "italic-parish" - }, - "source": [ - "## Run evaluation\n", - "\n", - "[Google Text normalization dataset](https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish) consists of 1.1 billion words of English text from Wikipedia, divided across 100 files. The normalized text is obtained with [The Kestrel TTS text normalization system](https://www.cambridge.org/core/journals/natural-language-engineering/article/abs/kestrel-tts-text-normalization-system/F0C18A3F596B75D83B75C479E23795DA)).\n", - "\n", - "Although a large fraction of this dataset can be reused for ITN by swapping input with output, the dataset is not bijective. \n", - "\n", - "For example: `1,000 -> one thousand`, `1000 -> one thousand`, `3:00pm -> three p m`, `3 pm -> three p m` are valid data samples for normalization but the inverse does not hold for ITN. \n", - "\n", - "We used regex rules to disambiguate samples where possible, see `nemo_text_processing/inverse_text_normalization/clean_eval_data.py`.\n", - "\n", - "To run evaluation, the input file should follow the Google Text normalization dataset format. That is, every line of the file needs to have the format `\\t\\t` if it's trivial class or `\\t\\t` in case of a semiotic class.\n", - "\n", - "Example evaluation run: \n", - "\n", - "`python run_evaluate.py \\\n", - " --input=./en_with_types/output-00001-of-00100 \\\n", - " [--language LANGUAGE] \\\n", - " [--cat CATEGORY] \\\n", - " [--filter]`\n", - " \n", - " \n", - "Use `--cat` to specify a `CATEGORY` to run evaluation on (all other categories are going to be excluded from evaluation). With the option `--filter`, the provided data will be cleaned to avoid disambiguates (use `clean_eval_data.py` to clean up the data upfront)." - ], - "id": "italic-parish" - }, - { - "cell_type": "code", - "metadata": { - "id": "intimate-astronomy" - }, - "source": [ - "eval_text = \"\"\"PLAIN\\ton\\t\n", - "DATE\\t22 july 2012\\tthe twenty second of july twenty twelve\n", - "PLAIN\\tthey\\t\n", - "PLAIN\\tworked\\t\n", - "PLAIN\\tuntil\\t\n", - "TIME\\t12:00\\ttwelve o'clock\n", - "\\t\n", - "\"\"\"\n", - "\n", - "INPUT_FILE_EVAL = f'{DATA_DIR}/test_eval.txt'\n", - "\n", - "with open(INPUT_FILE_EVAL, 'w') as f:\n", - " f.write(eval_text)\n", - "! cat $INPUT_FILE_EVAL" - ], - "id": "intimate-astronomy", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "corporate-contest" - }, - "source": [ - "! python $NEMO_TOOLS_PATH/run_evaluate.py --input=$INPUT_FILE_EVAL --language='en'" - ], - "id": "corporate-contest", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "raised-exemption" - }, - "source": [ - "`run_evaluate.py` call will output both **sentence level** and **token level** accuracies. \n", - "For our example, the expected output is the following:\n", - "\n", - "```\n", - "Loading training data: data_dir/test_eval.txt\n", - "Sentence level evaluation...\n", - "- Data: 1 sentences\n", - "100% 1/1 [00:00<00:00, 58.42it/s]\n", - "- Denormalized. Evaluating...\n", - "- Accuracy: 1.0\n", - "Token level evaluation...\n", - "- Token type: PLAIN\n", - " - Data: 4 tokens\n", - "100% 4/4 [00:00<00:00, 504.73it/s]\n", - " - Denormalized. 
Evaluating...\n", - " - Accuracy: 1.0\n", - "- Token type: DATE\n", - " - Data: 1 tokens\n", - "100% 1/1 [00:00<00:00, 118.95it/s]\n", - " - Denormalized. Evaluating...\n", - " - Accuracy: 1.0\n", - "- Token type: TIME\n", - " - Data: 1 tokens\n", - "100% 1/1 [00:00<00:00, 230.44it/s]\n", - " - Denormalized. Evaluating...\n", - " - Accuracy: 1.0\n", - "- Accuracy: 1.0\n", - " - Total: 6 \n", - "\n", - "Class | Num Tokens | Denormalization\n", - "sent level | 1 | 1.0 \n", - "PLAIN | 4 | 1.0 \n", - "DATE | 1 | 1.0 \n", - "CARDINAL | 0 | 0 \n", - "LETTERS | 0 | 0 \n", - "VERBATIM | 0 | 0 \n", - "MEASURE | 0 | 0 \n", - "DECIMAL | 0 | 0 \n", - "ORDINAL | 0 | 0 \n", - "DIGIT | 0 | 0 \n", - "MONEY | 0 | 0 \n", - "TELEPHONE | 0 | 0 \n", - "ELECTRONIC | 0 | 0 \n", - "FRACTION | 0 | 0 \n", - "TIME | 1 | 1.0 \n", - "ADDRESS | 0 | 0 \n", - "```" - ], - "id": "raised-exemption" - }, - { - "cell_type": "markdown", - "metadata": { - "id": "imported-literacy" - }, - "source": [ - "# C++ deployment\n", - "\n", - "The instructions on how to export `Pynini` grammars and to run them with `Sparrowhawk`, could be found at [NeMo/tools/text_processing_deployment](https://github.com/NVIDIA/NeMo/tree/main/tools/text_processing_deployment)." - ], - "id": "imported-literacy" - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bronze-nerve" - }, - "source": [ - "# WFST and Common Pynini Operations\n", - "\n", - "Finite-state acceptor (or FSA) is a finite state automaton that has a finite number of states and no output. FSA either accepts (when the matching patter is found) or rejects a string (no match is found). " - ], - "id": "bronze-nerve" - }, - { - "cell_type": "code", - "metadata": { - "id": "heavy-distance" - }, - "source": [ - "print([byte for byte in bytes('fst', 'utf-8')])\n", - "\n", - "# create an acceptor from a string\n", - "pynini.accep('fst')" - ], - "id": "heavy-distance", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "brave-avatar" - }, - "source": [ - "Here `0` - is a start note, `1` and `2` are the accept nodes, while `3` is a finite state.\n", - "By default (token_type=\"byte\", `Pynini` interprets the string as a sequence of bytes, assigning one byte per arc. \n", - "\n", - "A finite state transducer (FST) not only matches the pattern but also produces output according to the defined transitions." 
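As a small, illustrative sketch of what "produces output" means in practice, an input string can be composed with a transducer and the lowest-weight path read back as the rewritten string; this uses only `pynini` calls that already appear in this notebook.

```python
import pynini

# a transducer that rewrites the input "fst" to the output "FST"
rewrite = pynini.cross('fst', 'FST')

# apply it: compose the input string with the FST and take the shortest
# (lowest-weight) path, then read the output labels back as a string
print(pynini.shortestpath('fst' @ rewrite).string())  # -> FST
```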
- ], - "id": "brave-avatar" - }, - { - "cell_type": "code", - "metadata": { - "id": "paperback-female" - }, - "source": [ - "# create an FST\n", - "pynini.cross('fst', 'FST')" - ], - "id": "paperback-female", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "blond-hypothetical" - }, - "source": [ - "Pynini supports the following operations:\n", - "\n", - "- `closure` - Computes concatenative closure.\n", - "- `compose` - Constructively composes two FSTs.\n", - "- `concat` - Computes the concatenation (product) of two FSTs.\n", - "- `difference` - Constructively computes the difference of two FSTs.\n", - "- `invert` - Inverts the FST's transduction.\n", - "- `optimize` - Performs a generic optimization of the FST.\n", - "- `project` - Converts the FST to an acceptor using input or output labels.\n", - "- `shortestpath` - Construct an FST containing the shortest path(s) in the input FST.\n", - "- `union`- Computes the union (sum) of two or more FSTs.\n", - "\n", - "\n", - "The list of most commonly used `Pynini` operations could be found [https://github.com/kylebgorman/pynini/blob/master/CHEATSHEET](https://github.com/kylebgorman/pynini/blob/master/CHEATSHEET). \n", - "\n", - "Pynini examples could be found at [https://github.com/kylebgorman/pynini/tree/master/pynini/examples](https://github.com/kylebgorman/pynini/tree/master/pynini/examples).\n", - "Use `help()` to explore the functionality. For example:" - ], - "id": "blond-hypothetical" - }, - { - "cell_type": "code", - "metadata": { - "id": "arctic-firewall" - }, - "source": [ - "help(pynini.union)" - ], - "id": "arctic-firewall", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "continued-optimum" - }, - "source": [ - "# NeMo ITN API" - ], - "id": "continued-optimum" - }, - { - "cell_type": "markdown", - "metadata": { - "id": "twenty-terrorist" - }, - "source": [ - "NeMo ITN defines the following APIs that are called in sequence:\n", - "\n", - "- `find_tags() + select_tag()` - creates a linear automaton from the input string and composes it with the final classification WFST, which transduces numbers and inserts semantic tags. \n", - "- `parse()` - parses the tagged string into a list of key-value items representing the different semiotic tokens.\n", - "- `generate_permutations()` - takes the parsed tokens and generates string serializations with different reorderings of the key-value items. This is important since WFSTs can only process input linearly, but the word order can change from spoken to written form (e.g., `three dollars -> $3`). \n", - "- `find_verbalizer() + select_verbalizer` - takes the intermediate string representation and composes it with the final verbalization WFST, which removes the tags and returns the written form. \n", - "\n", - "![alt text](images/pipeline.png \"Inverse Text Normalization Pipeline\")" - ], - "id": "twenty-terrorist" - }, - { - "cell_type": "markdown", - "metadata": { - "id": "twenty-charles" - }, - "source": [ - "# References and Further Reading:\n", - "\n", - "\n", - "- [Zhang, Yang, Bakhturina, Evelina, Gorman, Kyle and Ginsburg, Boris. \"NeMo Inverse Text Normalization: From Development To Production.\" (2021)](https://arxiv.org/abs/2104.05055)\n", - "- [Ebden, Peter, and Richard Sproat. 
\"The Kestrel TTS text normalization system.\" Natural Language Engineering 21.3 (2015): 333.](https://www.cambridge.org/core/journals/natural-language-engineering/article/abs/kestrel-tts-text-normalization-system/F0C18A3F596B75D83B75C479E23795DA)\n", - "- [Gorman, Kyle. \"Pynini: A Python library for weighted finite-state grammar compilation.\" Proceedings of the SIGFSM Workshop on Statistical NLP and Weighted Automata. 2016.](https://www.aclweb.org/anthology/W16-2409.pdf)\n", - "- [Mohri, Mehryar, Fernando Pereira, and Michael Riley. \"Weighted finite-state transducers in speech recognition.\" Computer Speech & Language 16.1 (2002): 69-88.](https://cs.nyu.edu/~mohri/postscript/csl01.pdf)" - ], - "id": "twenty-charles" - } - ] -} \ No newline at end of file diff --git a/tutorials/text_processing/README.md b/tutorials/text_processing/README.md deleted file mode 100644 index 07e4ac0ea5b5..000000000000 --- a/tutorials/text_processing/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# NeMo Text Processing Tutorials - -The NeMo Text Processing module provides support for both Text Normalization (TN) and -Inverse Text Normalization (ITN) in order to aid upstream and downstream text processing. -The included tutorials are intended to help you quickly become familiar with the interface -of the module, as well as guiding you in creating and deploying your own grammars for individual -text processing needs. - -If you wish to learn more about how to use NeMo's for Text Normalization tasks (e.g. conversion -of symbolic strings to verbal form - such as `15` -> "fifteen"), please see the [`Text Normalization`](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Text_Normalization.ipynb) -tutorial. - -If you wish to learn more about Inverse Text Normalization - the inverse task of converting -from verbalized strings to symbolic written form, as may be encountered in downstream ASR - -consult the [`Inverse Text Normalization`](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Inverse_Text_Normalization.ipynb) tutorial. - -For those curious about constructing grammars tailored to specific languages and use cases, -you may be interested in working through the [`WFST Tutorial`](https://github.com/NVIDIA/NeMo/blob/stable/tutorials/text_processing/WFST_Tutorial.ipynb), which goes through NeMo's Normalization -process in detail. - -As NeMo Text Processing utilizes Weighted Finite State Transducer (WFST) graphs to construct its -grammars, a working knowledge of [Finite State Automata](https://en.wikipedia.org/wiki/Finite-state_machine) (FSA) and/or regular languages is suggested. -Further, we recommend becoming functionally familiar with the [`pynini` library](https://www.openfst.org/twiki/bin/view/GRM/Pynini) - which functions -as the backend for graph construction - and [Sparrowhawk](https://github.com/google/sparrowhawk) - which NeMo utilizes for grammar deployment. 
\ No newline at end of file diff --git a/tutorials/text_processing/Text_(Inverse)_Normalization.ipynb b/tutorials/text_processing/Text_(Inverse)_Normalization.ipynb new file mode 100755 index 000000000000..2f9dc0c7f8aa --- /dev/null +++ b/tutorials/text_processing/Text_(Inverse)_Normalization.ipynb @@ -0,0 +1,444 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "a5fA5qAm5Afg" + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can run either this notebook locally or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Optional: Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> **_NOTE:_** Find the official NeMo documentation at \n", + "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/intro.html " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Overview\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F-IrnmXMTevr" + }, + "source": [ + "A sentence can be split up into semiotic tokens stemming from a varity of classes, where the spoken form differs from the written form. Examples are *dates*, *decimals*, *cardinals*, *measures* etc. The good TN or ITN system will be able to handle a variety of **semiotic classes**." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-IT1Xr9iW2Xr" + }, + "source": [ + "# How to use\n", + "## 1. Installation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Install NeMo, which installs both nemo and nemo_text_processing package\n", + "BRANCH = 'r1.9.0'\n", + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# try to import of nemo_text_processing an other dependencies\n", + "import nemo_text_processing\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Text Normalization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Bfs7fa9lXDDh" + }, + "outputs": [], + "source": [ + "# create text normalization instance that works on cased input\n", + "from nemo_text_processing.text_normalization.normalize import Normalizer\n", + "normalizer = Normalizer(input_case='cased', lang='en')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# the normalizer class offers the following parameterization. 
\n", + "print(normalizer.__doc__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> **_NOTE:_** Standard Text Normalization uses `determistic=True`, outputting a single output for a given input string\n", + "\n", + "\n", + "\n", + "#### 2.1 Run TN on input string" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Normalizer.normalize() offers the following parameterization\n", + "print(normalizer.normalize.__doc__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# run normalization on example string input\n", + "written = \"We paid $123 for this desk.\"\n", + "normalized = normalizer.normalize(written, verbose=True, punct_post_process=True)\n", + "print(normalized)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "intermediate semtiotic class information is shown if verbose=True." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### 2.1 Run TN on list of input strings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UD-OuFmEOX3T" + }, + "outputs": [], + "source": [ + "# create temporary data folder and example input file\n", + "DATA_DIR = 'tmp_data_dir'\n", + "os.makedirs(DATA_DIR, exist_ok=True)\n", + "INPUT_FILE = f'{DATA_DIR}/inference.txt'\n", + "! echo -e 'The alarm went off at 10:00a.m. \\nI received $123' > $INPUT_FILE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d4T0gXHwY3JZ" + }, + "outputs": [], + "source": [ + "# check input file was properly created\n", + "! cat $INPUT_FILE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load input file into 'data' - a list of strings\n", + "data = []\n", + "with open(INPUT_FILE, 'r') as fp:\n", + " for line in fp:\n", + " data.append(line.strip())\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "F5wSJTI8ZFRg" + }, + "outputs": [], + "source": [ + "# run normalization on 'data'\n", + "normalizer.normalize_list(data, punct_post_process=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RMT5lkPYzZHK" + }, + "source": [ + "### 2.2 Evaluate TN on written-normalized text pairs \n", + "\n", + "The evaluation data needs to have the following format:\n", + "\n", + "'on 22 july 2022 they worked until 12:00' and the normalization is represented as " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# example evaluation sentence\n", + "eval_text = \"\"\"PLAIN\\ton\\t\n", + "DATE\\t22 july 2012\\tthe twenty second of july twenty twelve\n", + "PLAIN\\tthey\\t\n", + "PLAIN\\tworked\\t\n", + "PLAIN\\tuntil\\t\n", + "TIME\\t12:00\\ttwelve o'clock\n", + "\\t\n", + "\"\"\"\n", + "EVAL_FILE = f'{DATA_DIR}/eval.txt'\n", + "with open(EVAL_FILE, 'w') as fp:\n", + " fp.write(eval_text)\n", + "! cat $EVAL_FILE" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RMT5lkPYzZHK" + }, + "source": [ + "That is, every sentence is broken into semiotic tokens line by line and concluded by end of sentence token ``. In case of a plain token it's `[SEMIOTIC CLASS] [TAB] [WRITTEN] [TAB] `, otherwise `[SEMIOTIC CLASS] [TAB] [WRITTEN] [TAB] [NORMALIZED]`.\n", + "This format was introduced in [Google Text normalization dataset](https://arxiv.org/abs/1611.00068). 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Parse evaluation file into written and normalized sentence pairs\n", + "from nemo_text_processing.text_normalization.data_loader_utils import load_files, training_data_to_sentences\n", + "eval_data = load_files([EVAL_FILE])\n", + "sentences_un_normalized, sentences_normalized, sentences_class_types = training_data_to_sentences(eval_data)\n", + "print(list(zip(sentences_un_normalized, sentences_normalized)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# run prediction\n", + "sentences_prediction = normalizer.normalize_list(sentences_un_normalized)\n", + "print(sentences_prediction)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# measure sentence accuracy\n", + "from nemo_text_processing.text_normalization.data_loader_utils import evaluate\n", + "sentences_accuracy = evaluate(\n", + " preds=sentences_prediction, labels=sentences_normalized, input=sentences_un_normalized\n", + " )\n", + "print(\"- Accuracy: \" + str(sentences_accuracy))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Inverse Text Normalization\n", + "ITN supports equivalent API as TN. Here we are only going to show inverse normalization on input string" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create inverse text normalization instance\n", + "from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer\n", + "inverse_normalizer = InverseNormalizer(lang='en')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# run ITN on example string input\n", + "spoken = \"we paid one hundred twenty three dollars for this desk\"\n", + "un_normalized = inverse_normalizer.inverse_normalize(spoken, verbose=True)\n", + "print(un_normalized)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Audio-based Text Normalization\n", + "Audio-based text normalization uses extended [WFST](https://en.wikipedia.org/wiki/Finite-state_machine) grammars to provide a range of possible normalization options.\n", + "The following example shows the workflow: (Disclaimer: exact values in graphic do not need to be real system's behavior)\n", + "1. text \"627\" is sent to extended TN WFST grammar\n", + "2. grammar output 5 different options of verbalization based on text input alone\n", + "3. in case an audio file is presented we compare the audio transcript with the verbalization options to find out which normalization is correct based on character error rate. The transcript is generated using a pretrained NeMo ASR model. 
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following shows an example of how to generate multiple normalization options:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import non-deterministic WFST-based TN module\n", + "from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# initialize normalizer, this may take some time to generate the extended grammars. \n", + "# Thus, we recommend to cache the grammars by specifying a cache directory\n", + "normalizer = NormalizerWithAudio(\n", + " lang=\"en\",\n", + " input_case=\"cased\",\n", + " overwrite_cache=False,\n", + " cache_dir=\"cache_dir\",\n", + " )\n", + "# create up to 10 normalization options\n", + "print(normalizer.normalize(\"123\", n_tagged=10, punct_post_process=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Parallel execution\n", + "\n", + "`Normalizer.normalize()` as well as `InverseNormalizer.inverse_normalize()` are functions without side effect.\n", + "Thus, if you need to normalize large amounts of input examples, these can be executed in parallel." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ENMDNl9C4TkF" + }, + "source": [ + "# Tutorial on how to customize grammars\n", + "\n", + "https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/WFST_Tutorial.ipynb\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lcvT3P2lQ_GS" + }, + "source": [ + "# References and Further Reading:\n", + "\n", + "\n", + "- [Zhang, Yang, Bakhturina, Evelina, Gorman, Kyle and Ginsburg, Boris. \"NeMo Inverse Text Normalization: From Development To Production.\" (2021)](https://arxiv.org/abs/2104.05055)\n", + "- [Ebden, Peter, and Richard Sproat. \"The Kestrel TTS text normalization system.\" Natural Language Engineering 21.3 (2015): 333.](https://www.cambridge.org/core/journals/natural-language-engineering/article/abs/kestrel-tts-text-normalization-system/F0C18A3F596B75D83B75C479E23795DA)\n", + "- [Gorman, Kyle. \"Pynini: A Python library for weighted finite-state grammar compilation.\" Proceedings of the SIGFSM Workshop on Statistical NLP and Weighted Automata. 2016.](https://www.aclweb.org/anthology/W16-2409.pdf)\n", + "- [Mohri, Mehryar, Fernando Pereira, and Michael Riley. 
\"Weighted finite-state transducers in speech recognition.\" Computer Speech & Language 16.1 (2002): 69-88.](https://cs.nyu.edu/~mohri/postscript/csl01.pdf)" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [ + "lcvT3P2lQ_GS" + ], + "name": "Text_Normalization_Tutorial.ipynb", + "private_outputs": true, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/tutorials/text_processing/Text_Normalization.ipynb b/tutorials/text_processing/Text_Normalization.ipynb deleted file mode 100755 index e3273ad1738c..000000000000 --- a/tutorials/text_processing/Text_Normalization.ipynb +++ /dev/null @@ -1,395 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "Text_Normalization_Tutorial.ipynb", - "private_outputs": true, - "provenance": [], - "collapsed_sections": [ - "lcvT3P2lQ_GS" - ], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.8" - } - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "id": "a5fA5qAm5Afg" - }, - "source": [ - "if 'google.colab' in str(get_ipython()):\n", - " !pip install -q condacolab\n", - " import condacolab\n", - " condacolab.install()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "x0DJqotopcyb", - "collapsed": true - }, - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "BRANCH = 'r1.9.0'\n", - "if 'google.colab' in str(get_ipython()):\n", - " !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "nYsp3SH24Tj_" - }, - "source": [ - "if 'google.colab' in str(get_ipython()):\n", - " ! conda install -c conda-forge pynini=2.1.3\n", - " ! mkdir images\n", - " ! wget https://github.com/NVIDIA/NeMo/blob/$BRANCH/tutorials/text_processing/images/deployment.png -O images/deployment.png\n", - " ! 
wget https://github.com/NVIDIA/NeMo/blob/$BRANCH/tutorials/text_processing/images/pipeline.png -O images/pipeline.png" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "CH7yR7cSwPKr" - }, - "source": [ - "import os\n", - "import wget\n", - "import pynini\n", - "import nemo_text_processing\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F-IrnmXMTevr" - }, - "source": [ - "# Task Description\n", - "\n", - "Text normalization (TN) is a part of the Text-To-Speech (TTS) pre-processing pipeline. It could also be used for pre-processing Automatic Speech Recognition (ASR) training transcripts.\n", - "\n", - "TN is the task of converting text in written form to its spoken form to improve TTS. For example, `10:00` should be changed to `ten o'clock` and `10kg` to `ten kilograms`." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xXRARM8XtK_g" - }, - "source": [ - "# NeMo Text Normalization\n", - "\n", - "NeMo TN is based on weighted finite-state\n", - "transducer (WFST) grammars. The tool uses [`Pynini`](https://github.com/kylebgorman/pynini) to construct WFSTs, and the created grammars can be exported and integrated into [`Sparrowhawk`](https://github.com/google/sparrowhawk) (an open-source version of [The Kestrel TTS text normalization system](https://www.cambridge.org/core/journals/natural-language-engineering/article/abs/kestrel-tts-text-normalization-system/F0C18A3F596B75D83B75C479E23795DA)) for production. The NeMo TN tool can be seen as a Python extension of `Sparrowhawk`. \n", - "\n", - "Currently, NeMo TN provides support for English and the following semiotic classes from the [Google Text normalization dataset](https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish):\n", - "DATE, CARDINAL, MEASURE, DECIMAL, ORDINAL, MONEY, TIME, TELEPHONE, ELECTRONIC, PLAIN. We additionally added the class `WHITELIST` for all whitelisted tokens whose verbalizations are directly looked up from a user-defined list.\n", - "\n", - "The toolkit is modular, easily extendable, and can be adapted to other languages and tasks like [inverse text normalization](https://github.com/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Inverse_Text_Normalization.ipynb). The Python environment enables an easy combination of text covering grammars with NNs. \n", - "\n", - "The rule-based system is divided into a classifier and a verbalizer following [Google's Kestrel](https://www.researchgate.net/profile/Richard_Sproat/publication/277932107_The_Kestrel_TTS_text_normalization_system/links/57308b1108aeaae23f5cc8c4/The-Kestrel-TTS-text-normalization-system.pdf) design: the classifier is responsible for detecting and classifying semiotic classes in the underlying text, the verbalizer the verbalizes the detected text segment. 
\n", - "In the example `The alarm goes off at 10:30 a.m.`, the tagger for TIME detects `10:30 a.m.` as a valid time data with `hour=10`, `minutes=30`, `suffix=a.m.`, the verbalizer then turns this into `ten thirty a m`.\n", - "\n", - "The overall NeMo TN pipeline from development in `Pynini` to deployment in `Sparrowhawk` is shown below (example for ITN):\n", - "![alt text](images/deployment.png \"Inverse Text Normalization Pipeline\")\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-IT1Xr9iW2Xr" - }, - "source": [ - "# Quick Start\n", - "\n", - "## Add TN to your Python TTS pre-processing workflow\n", - "\n", - "TN is a part of the `nemo_text_processing` package which is installed with `nemo_toolkit`. Installation instructions could be found [here](https://github.com/NVIDIA/NeMo/tree/main/README.rst)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Bfs7fa9lXDDh" - }, - "source": [ - "from nemo_text_processing.text_normalization.normalize import Normalizer\n", - "# creates normalizer object that works on lower cased input\n", - "normalizer = Normalizer(input_case='cased', lang='en')\n", - "raw_text = \"We paid $123 for this desk.\"\n", - "normalizer.normalize(raw_text, verbose=False)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "w5sX0SXbXoZp" - }, - "source": [ - "In the above cell, `$123` would be converted to `one hundred twenty three dollars`, and the rest of the words remain the same.\n", - "\n", - "## Run Text Normalization on an input from a file\n", - "\n", - "Use `run_predict.py` to convert a written format from a file `INPUT_FILE` to a spoken text and save the output to `OUTPUT_FILE`. Under the hood, `run_predict.py` is calling `normalize()` (see the above section)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "UD-OuFmEOX3T" - }, - "source": [ - "# If you're running the notebook locally, update the NEMO_TEXT_PROCESSING_PATH below\n", - "# In Colab, a few required scripts will be downloaded from NeMo github\n", - "\n", - "NEMO_TOOLS_PATH = '/nemo_text_processing/text_normalization'\n", - "DATA_DIR = 'data_dir'\n", - "os.makedirs(DATA_DIR, exist_ok=True)\n", - "\n", - "if 'google.colab' in str(get_ipython()):\n", - " NEMO_TOOLS_PATH = '.'\n", - "\n", - " required_files = ['run_predict.py',\n", - " 'run_evaluate.py']\n", - " for file in required_files:\n", - " if not os.path.exists(file):\n", - " file_path = 'https://raw.githubusercontent.com/NVIDIA/NeMo/' + BRANCH + '/nemo_text_processing/text_normalization/' + file\n", - " print(file_path)\n", - " wget.download(file_path)\n", - "elif not os.path.exists(NEMO_TOOLS_PATH):\n", - " raise ValueError(f'update path to NeMo root directory')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "d4T0gXHwY3JZ" - }, - "source": [ - "INPUT_FILE = f'{DATA_DIR}/test.txt'\n", - "OUTPUT_FILE = f'{DATA_DIR}/test_tn.txt'\n", - "\n", - "! echo \"The alarm went off at 10:00.\" > $DATA_DIR/test.txt\n", - "! cat $INPUT_FILE\n", - "! python $NEMO_TOOLS_PATH/run_predict.py --input=$INPUT_FILE --output=$OUTPUT_FILE --language='en'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "F5wSJTI8ZFRg" - }, - "source": [ - "# check that the raw text was converted to the spoken form\n", - "! 
cat $OUTPUT_FILE" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RMT5lkPYzZHK" - }, - "source": [ - "## Run evaluation\n", - "\n", - "[Google Text normalization dataset](https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish) consists of 1.1 billion words of English text from Wikipedia, divided across 100 files. The normalized text is obtained with The Kestrel TTS text normalization system).\n", - "\n", - "To run evaluation, the input file should follow the Google Text normalization dataset format. That is, every line of the file needs to have the format `\\t\\t` if it's trivial class or `\\t\\t` in case of a semiotic class.\n", - "\n", - "\n", - "Example evaluation run:\n", - "\n", - "\n", - "`python run_evaluate.py \\\n", - " --input=./en_with_types/output-00001-of-00100 \\\n", - " [--language LANGUAGE] \\\n", - " [--input_case INPUT_CASE] \\\n", - " [--cat CATEGORY]`\n", - "\n", - "Use `--cat` to specify a `CATEGORY` to run evaluation on (all other categories are going to be excluded from evaluation). The option `--input_case` tells the algorithm that the input is either lower cased or cased.\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "u4zjeVVv-UXR" - }, - "source": [ - "eval_text = \"\"\"PLAIN\\ton\\t\n", - "DATE\\t22 july 2012\\tthe twenty second of july twenty twelve\n", - "PLAIN\\tthey\\t\n", - "PLAIN\\tworked\\t\n", - "PLAIN\\tuntil\\t\n", - "TIME\\t12:00\\ttwelve o'clock\n", - "\\t\n", - "\"\"\"\n", - "INPUT_FILE_EVAL = f\"{DATA_DIR}/test_eval.txt\"\n", - "with open(INPUT_FILE_EVAL, 'w') as fp:\n", - " fp.write(eval_text)\n", - "! cat $INPUT_FILE_EVAL" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "G7_5oXpObizP" - }, - "source": [ - "! python $NEMO_TOOLS_PATH/run_evaluate.py --input=$INPUT_FILE_EVAL --language='en'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bIvKBwRcH_9W" - }, - "source": [ - "`run_evaluate.py` call will output both **sentence level** and **token level** accuracies. \n", - "For our example, the expected output is the following:\n", - "\n", - "```\n", - "Loading training data: data_dir/test_eval.txt\n", - "Sentence level evaluation...\n", - "- Data: 1 sentences\n", - "100% 1/1 [00:00<00:00, 14.24it/s]\n", - "- Normalized. Evaluating...\n", - "- Accuracy: 1.0\n", - "Token level evaluation...\n", - "- Token type: PLAIN\n", - " - Data: 4 tokens\n", - "100% 4/4 [00:00<00:00, 239.56it/s]\n", - " - Denormalized. Evaluating...\n", - " - Accuracy: 1.0\n", - "- Token type: DATE\n", - " - Data: 1 tokens\n", - "100% 1/1 [00:00<00:00, 33.69it/s]\n", - " - Denormalized. Evaluating...\n", - " - Accuracy: 1.0\n", - "- Token type: TIME\n", - " - Data: 1 tokens\n", - "100% 1/1 [00:00<00:00, 94.84it/s]\n", - " - Denormalized. 
Evaluating...\n", - " - Accuracy: 1.0\n", - "- Accuracy: 1.0\n", - " - Total: 6 \n", - "\n", - " - Total: 6 \n", - "\n", - "Class | Num Tokens | Normalization\n", - "sent level | 1 | 1.0 \n", - "PLAIN | 4 | 1.0 \n", - "DATE | 1 | 1.0 \n", - "CARDINAL | 0 | 0 \n", - "LETTERS | 0 | 0 \n", - "VERBATIM | 0 | 0 \n", - "MEASURE | 0 | 0 \n", - "DECIMAL | 0 | 0 \n", - "ORDINAL | 0 | 0 \n", - "DIGIT | 0 | 0 \n", - "MONEY | 0 | 0 \n", - "TELEPHONE | 0 | 0 \n", - "ELECTRONIC | 0 | 0 \n", - "FRACTION | 0 | 0 \n", - "TIME | 1 | 1.0 \n", - "ADDRESS | 0 | 0 \n", - "\n", - "```\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L85ZaUJ_4TkF" - }, - "source": [ - "# C++ deployment\n", - "\n", - "The instructions on how to export `Pynini` grammars and to run them with `Sparrowhawk`, could be found at [NeMo/tools/text_processing_deployment](https://github.com/NVIDIA/NeMo/tree/main/tools/text_processing_deployment)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ENMDNl9C4TkF" - }, - "source": [ - "# WFST and Common Pynini Operations\n", - "\n", - "See [NeMo Text Inverse Normalization Tutorial](https://github.com/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Inverse_Text_Normalization.ipynb) for details." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lcvT3P2lQ_GS" - }, - "source": [ - "# References and Further Reading:\n", - "\n", - "\n", - "- [Zhang, Yang, Bakhturina, Evelina, Gorman, Kyle and Ginsburg, Boris. \"NeMo Inverse Text Normalization: From Development To Production.\" (2021)](https://arxiv.org/abs/2104.05055)\n", - "- [Ebden, Peter, and Richard Sproat. \"The Kestrel TTS text normalization system.\" Natural Language Engineering 21.3 (2015): 333.](https://www.cambridge.org/core/journals/natural-language-engineering/article/abs/kestrel-tts-text-normalization-system/F0C18A3F596B75D83B75C479E23795DA)\n", - "- [Gorman, Kyle. \"Pynini: A Python library for weighted finite-state grammar compilation.\" Proceedings of the SIGFSM Workshop on Statistical NLP and Weighted Automata. 2016.](https://www.aclweb.org/anthology/W16-2409.pdf)\n", - "- [Mohri, Mehryar, Fernando Pereira, and Michael Riley. \"Weighted finite-state transducers in speech recognition.\" Computer Speech & Language 16.1 (2002): 69-88.](https://cs.nyu.edu/~mohri/postscript/csl01.pdf)" - ] - } - ] -} \ No newline at end of file diff --git a/tutorials/text_processing/WFST_Tutorial.ipynb b/tutorials/text_processing/WFST_Tutorial.ipynb index f714c6a6be54..981a2d5cef5c 100644 --- a/tutorials/text_processing/WFST_Tutorial.ipynb +++ b/tutorials/text_processing/WFST_Tutorial.ipynb @@ -1,7196 +1,7050 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Qq1Hz6CKWdwl", - "outputId": "3d8f5bd6-f10e-431d-9039-eb88164fbb95" - }, - "outputs": [], - "source": [ - "### WARNING: This notebook will not work in a Colab environment. 
\n", - "\n", - "BRANCH= 'r1.9.0'\n", - "\n", - "!git clone -b $BRANCH https://github.com/NVIDIA/NeMo\n", - "%cd NeMo\n", - "!./reinstall.sh" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pynini\n", - "import nemo_text_processing\n", - "\n", - "from pynini.lib import pynutil" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, NEMO_DIGIT, delete_space, NEMO_SIGMA, NEMO_NOT_QUOTE, delete_extra_space, NEMO_NON_BREAKING_SPACE\n", - "from nemo_text_processing.text_normalization.normalize import Normalizer\n", - "\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.cardinal import CardinalFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.decimal import DecimalFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.money import MoneyFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.ordinal import OrdinalFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.punctuation import PunctuationFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.time import TimeFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.whitelist import WhiteListFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.word import WordFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.cardinal import CardinalFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.decimal import DecimalFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.money import MoneyFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.ordinal import OrdinalFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.time import TimeFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.whitelist import WhiteListFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.word import WordFst\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T0JxcvuPHvn9" - }, - "source": [ - "NeMo's Text Processing module uses Weighted Finite State Transducers (WFST) to deploy grammars for both efficient text normalization (TN) and inverse text normalization (ITN). In this tutorial, you will learn to build a normalization grammar from the ground up to use in your own text processing tasks. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Table of Contents\n", - "- WFSTs\n", - "- NeMo Text Processing\n", - "- Getting Started\n", - "- Cardinal WFST\n", - "- Ordinal WFST\n", - "- Decimal WFST\n", - "- Money WFST\n", - "- Time WFST\n", - "- WhiteList WFST\n", - "- Word and Punctuation WFST\n", - "- Other Classes\n", - "- Tokenize and Classify\n", - "- Verbalize and Verbalize Final\n", - "- Deployment" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lMUovcMsfXyI" - }, - "source": [ - "# WFSTs " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Y1ejNMLbH1jM" - }, - "source": [ - "WFSTs are a form of [Finite State Machines](https://en.wikipedia.org/wiki/Finite-state_machine) used to graph relations between regular languages (or [regular expressions](https://en.wikipedia.org/wiki/Regular_expression)). 
For our purposes, they can be defined by two major properties:\n", - "\n", - "1. Mappings between accepted input and output expressions for text substitution\n", - "2. Path weighting to direct graph traversal" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nNg45ZuaP_A8" - }, - "source": [ - "For example, consider a simple normalization task of mapping the word \"cent\" (French for \"one hundred\") to the numerical representation `100`. We would begin with a Finite State representation of the regex `/cent/`:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uxo7gUkW_XKT" - }, - "source": [ - "![cent.png](images/cent.PNG)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fahsjMVFlbCa" - }, - "source": [ - "And then create a mapping to the text string `100`:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IMJ-fNSk_jXC" - }, - "source": [ - "![cent_to_100.png](images/cent_to_100.PNG)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bPKW0I4yAGUb" - }, - "source": [ - "*Note: Null characters are expressed as `ε` by convention*" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_0NK3aW5nG9C" - }, - "source": [ - "This would give us a WFST with universal path weights. (By default, `pynini` uses [tropical semirings](https://en.wikipedia.org/wiki/Tropical_semiring) for arcs, giving each arc a default weight of `0`.)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CzBc9D3qTGJ-" - }, - "source": [ - "Now, let us consider expanding our model. To indicate values between `100` and `200`, French uses the number scheme of `cent + digit`. For example, `120` would be pronounced as \"cent-vingt\". To create the appropriate output string, we would now want to map \"cent\" to `1` and the remaining aspect of our string to the appropriate digit representation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GRrKNQRjFDoL" - }, - "source": [ - "![cent_vingt_to_120.png](images/cent_vingt_to_120.PNG)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jLpm4mufAfUz" - }, - "source": [ - "However this would make our graph [non-deterministic](https://en.wikipedia.org/wiki/Nondeterministic_algorithm) - it will have multiple possibilities for termination. Now an input of \"cent-vingt\" could have the outcome of `100` or `10020` when only one is correct. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![cent_vingt_bad.png](images/cent_vingt_bad.PNG)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "c-GJTpgIAf7S" - }, - "source": [ - "To correct this, we may add a new end state and a weight to the path that accepts the input without `s`:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6GJcsdttGg_S" - }, - "source": [ - "![cent_vingt_good.png](images/cent_vingt_good.PNG)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mHft1gzsAipc" - }, - "source": [ - "Now, we can guarantee an ideal mapping by relying on a shortest-path (smallest-weight) heuristic: traversal of the graph will prioritize longer inputs, only converting \"cent\" to `100` when a larger input isn't available. As such, we've now removed the undesired output `10020` while preserving our desired coverage in string mapping. \n", - "\n", - "This use of weights to ensure predictable behavior allows WFSTs to exploit the efficiency of standard graph traversal algorithms while also maintaining versatility. 
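The same idea can be written directly in `pynini` as a deliberately simplified sketch; the toy grammar below is not the one this tutorial builds later, but it mirrors the weighting of the "cent"-alone path described above.

```python
import pynini
from pynini.lib import pynutil

vingt = pynini.cross("vingt", "20")

# "cent" on its own -> "100"; give this path a small penalty so that the longer
# "cent-<digit>" reading is preferred whenever more input is available
cent_alone = pynutil.add_weight(pynini.cross("cent", "100"), 1.0)

# "cent-vingt" -> "1" + "20" = "120"
cent_digit = pynini.cross("cent", "1") + pynutil.delete("-") + vingt

graph = (cent_alone | cent_digit).optimize()

# (in this tiny example composition alone already disambiguates; in the fuller
# grammars built later, the weight is what breaks ties between overlapping paths)
print(pynini.shortestpath("cent" @ graph).string())        # -> 100
print(pynini.shortestpath("cent-vingt" @ graph).string())  # -> 120
```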
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8Ik4PBXafSSB" - }, - "source": [ - "# NeMo Text Processing " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "b2fcWKhqYVF5" - }, - "source": [ - "Following [Google's Kestrel](https://www.researchgate.net/publication/277932107_The_Kestrel_TTS_text_normalization_system) framework, NeMo deploys two composite WFSTs for text normalization. They are as follows:\n", - "1. A *classifier* (or tagger) to label potential tokens by 'semiotic class' (e.g. currency, ordinal number, street address)\n", - "2. A *verbalizer* to render a tagged token in conventional written form\n", - "\n", - "For example, consider the sentence: <>\n", - "\n", - "For an ITN task, a tokenizer would identify the following tokens:\n", - "\n", - "`[\"le\" ,\"premier\", \"juillet\", \"il\", \"a\", \"mangé\", \"trente-cinq\", \"pommes\"]`\n", - "\n", - "and provide each a class token: \n", - "\n", - "- `tokens { name: \"le\" }`\n", - "- `tokens { date { day: \"1\" month: \"juillet\" } } ` \n", - "- `tokens { name: \"il\" }` \n", - "- `tokens { name: \"a\" }` \n", - "- `tokens { name: \"mangé\" }`\n", - "- `tokens { cardinal { integer: \"35\" } }` \n", - "- `tokens { name: \"pommes\" }`\n", - "\n", - "These tokens are then passed to a 'verbalizer' WFST, which renders each token in a conventional written form:\n", - "\n", - "- `tokens { name: \"le\" }` -> `le` \n", - "- `tokens { date { day: \"1\" month: \"juillet\" } } ` -> `1ᵉʳ` \n", - "- `tokens { name: \"il\" }` -> `juillet`\n", - "- `tokens { name: \"il\" }` -> `il` \n", - "- `tokens { name: \"a\" }` -> `a`\n", - "- `tokens { name: \"mangé\" }` -> `mangé` \n", - "- `tokens { cardinal { integer: \"35\" } }` -> `35` \n", - "- `tokens { name: \"pommes\" }` -> `pommes`\n", - "\n", - "and merged into a normalized string:\n", - "\n", - "`le 1ᵉʳ juillet il a mangé 35 pommes`\n", - "\n", - "With the equivalent TN task being the reverse process. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_n-5JExAbvwr" - }, - "source": [ - "A few things to note: \n", - "- Each class token has a unique set of field names that must be parsed by the classifier. The default field names for NeMo are chosen to mirror the syntax in [Sparrowhawk](https://github.com/google/sparrowhawk) to enable deployment. If these fields are not exact, you will not be able to use Sparrowhawk.\n", - "- NeMo assumes no punctuation (unless explicitly provided in the grammar) and all lower casing to ease integration with upstream ASR.\n", - "- The `name` class token is default for any token that does not require processing. It will be left 'as is.'\n", - "- You may note how the tokenizer performed the conversion of `premier` to `1` while the verbalizer normalized `1` -> `1ᵉʳ`. Such decisions are implementation dependent and will vary depending on preference and language. (That is, normalization from `premier` -> `1ᵉʳ` could have been a tokenization step.)\n", - "- By default, NeMo will create several permutations of key values in a token to ease normalization. That is, given the token `tokens { date { day: \"1\" month: \"juillet\" } }`, it will also produce paths for `tokens { date { month: \"juillet\" day: \"1\" } }`. To prevent this and avoid ambiguity in verbalizer input, tokens can be assigned a `preserve_order` attribute to prevent permutation. (e.g. 
`tokens { date { day: \"1\" month: \"juillet\" preserve_order: true } }`) (We will discuss this [later in the tutorial](#verbalizer).)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## WFST Classes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "NeMo Text Processing's base languages currently support only the following semiotic classes to permit integration with Sparrowhawk deployment. \n", - "\n", - "- CARDINAL\n", - "- ORDINAL\n", - "- DECIMAL\n", - "- FRACTION\n", - "- MEASURE\n", - "- MONEY\n", - "- TIME\n", - "- DATE\n", - "- ELECTRONIC\n", - "- TELEPHONE\n", - "- WHITELIST\n", - "- WORD\n", - "- PUNCTUATION\n", - "\n", - "For this tutorial, we will be focusing on the following classes:\n", - "- CARDINAL\n", - "- ORDINAL\n", - "- DECIMAL\n", - "- MONEY\n", - "- TIME\n", - "- WHITELIST\n", - "- WORD\n", - "- PUNCTUATION\n", - "\n", - "While not comprehensive, these classes will provide enough foundation and exposure to edge cases that you will feel comfortable constructing for other cases.\n", - "\n", - "**NOTE**: *If you intend to only develop for personal use with NeMo, you may rename these classes as desired. However, Sparrowhawk integration\n", - "REQUIRES use of only these tags and their assigned attributes. For list of Sparrowhawk tokens and attributes, [consult the Sparrowhawk repository](https://github.com/yzhang123/sparrowhawk/blob/test/src/proto/semiotic_classes.proto)*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Further Reading" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you wish to learn more about NeMo Text Processing, you may wish to consult the following:\n", - "- [Y. Zhang, E. Bakhturina, K. Gorman, and B. Ginsburg, \"NeMo Inverse Text Normalization: From Development To Production\"](https://arxiv.org/pdf/2104.05055.pdf)\n", - "- [NeMo's Text Normalization Documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nemo_text_processing/intro.html) \n", - "- [NeMo's Text Normalization Deployment Documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tools/text_processing_deployment.html)\n", - "- NeMo's [Text Normalization Tutorial](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Text_Normalization.ipynb) or [Inverse Text Normalization](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Inverse_Text_Normalization.ipynb) tutorials\n", - "- [Sparrowhawk Documentation](https://github.com/google/sparrowhawk)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For further information regarding WFSTs, please see:\n", - "- [D. Jufasky and J. Martin, *Natural Language Processing*, Ch. 2](https://web.stanford.edu/~jurafsky/slp3/2.pdf)\n", - "- [K. Gorman and R. Sproat, *Finite-State Text Processing*](http://www.morganclaypoolpublishers.com/catalog_Orig/product_info.php?products_id=1636)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XFdXRcnUfI25" - }, - "source": [ - "# Getting Started \n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "K3Zl3VwqdYqL" - }, - "source": [ - "To begin tokenizer development, make sure you have [installed NeMo from source](https://github.com/NVIDIA/NeMo)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rGg7Bf13FXgc" - }, - "source": [ - "For this tutorial, we will focus on developing an Inverse Text Normalization system, such as one you may encounter in downstream ASR processing. As such, we will navigate to\n", - "`nemo_text_processing/inverse_text_normalization` and create a directory for our target language (French) and subdirectories\n", - "for `taggers` and `verbalizers`. You may also wish to create a `data` subdirectory to ease navigation.\n", - "\n", - "(Note, for text normalization, the suggested directory structure would be the same within the `nemo_text_processing/text_normalization` folder. In fact, many of NeMo's grammars actively share between.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "T58E4pU4FN3A" - }, - "outputs": [], - "source": [ - "LANGUAGE= \"MY_LANGUAGE\" # Change this to your desired language" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_PAyEPhaFpCD", - "outputId": "23d034d2-8c93-4e8b-e3ce-5ba9e870f82d" - }, - "outputs": [], - "source": [ - "%cd nemo_text_processing/inverse_text_normalization/\n", - "!mkdir {LANGUAGE}\n", - "!mkdir \"{LANGUAGE}/taggers\"\n", - "!mkdir \"{LANGUAGE}/verbalizers\"\n", - "!mkdir \"{LANGUAGE}/data\"\n", - "%cd {LANGUAGE}\n", - "!pwd && ls" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dependencies" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "O1vfz-bUFpwz" - }, - "source": [ - "All WFSTs deployed in NeMo inherit from the `GraphFst` class.\n", - "While in most cases you can simply import from a pre-existing `graph_utils.py`, you may occasionally find it helpful for deployment to keep a copy \n", - "in your working directory for language specific edits. (For our purposes, we will be utilizing `nemo_text_processing.text_normalization.en.graph_utils`, which serves as default for NeMo's grammars.)\n", - "\n", - "You may also wish to keep a copy of `utils.py` (found in each language system's directory)\n", - "in your working directory to assist with pathing. (Make sure to adjust the imports towards your language.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3OME84EmOQ4h", - "outputId": "6eea17f9-aae9-4176-ae35-3d1f0e94b4ea" - }, - "outputs": [], - "source": [ - "!cp ../../text_normalization/en/graph_utils.py .\n", - "!cp ../../text_normalization/en/utils.py .\n", - "! cd ../../.." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For development, we utilize `nemo_text_processing` and `pynini` (a Python library for efficient WFST construction and traversal, installed with `NeMo-toolkit` by default). " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "While this tutorial will attempt to make use of `pynini` tools transparent, it does assume some familiarity with its syntax. For a more in-depth guide, the following will provide a function overview:\n", - "\n", - "- [K. Gorman, Pynini: A Python library for weighted finite-state grammar compilation](https://aclanthology.org/W16-2409.pdf)\n", - "- [K. 
Gorman, Pynini Tutorial](http://wellformedness.com/courses/pynini/)\n", - "- [Pynini Documentation](https://www.openfst.org/twiki/bin/view/GRM/PyniniDocs) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will also import the `pynutil` module for access to some extra functionality, along with writing a simple helper function for printing `pynini` graphs through the previously discussed 'shortest-path' heuristic." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sz18Ui8-8Kf4" - }, - "outputs": [], - "source": [ - "from pynini.lib import pynutil\n", - "\n", - "def apply_fst(text, fst):\n", - " \"\"\" Given a string input, returns the output string\n", - " produced by traversing the path with lowest weight.\n", - " If no valid path accepts input string, returns an\n", - " error.\n", - " \"\"\"\n", - " try:\n", - " print(pynini.shortestpath(text @ fst).string())\n", - " except pynini.FstOpError:\n", - " print(f\"Error: No valid output with given input: '{text}'\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Cardinal WFST " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rOyLZb9DgLoh" - }, - "source": [ - "The vast majority of ITN tasks require the ability to recognize and denormalize numbers. As such, we will begin with developing a Classifier and Verbalizer for Cardinal (integer) numbers. (e.g. `-3,-2,-1,0,1,2,3,4,5....99,100,101...`)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9GZQkH1V89kh" - }, - "source": [ - "## Grammar" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will begin by first constructing a Cardinal WFST, using French as an example language. While your target language will obviously differ greatly from our example, you will likely find some several similarities, such as:\n", - "- Use of a (semi) regular decimal (base-10) counting system. (A common - but not universal - feature of natural languages.)\n", - "- Incorporation of several irregularities requiring contingencies in our WFST construction. (e.g. a pseudo vigesimal (base-20) series.)\n", - "- Use of gender and number agreement in enumeration." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Digits" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NzJ2DIwc_TT3" - }, - "source": [ - "We shall begin with the first decimal place. 
As these numbers serve as the building blocks for the rest of our WFST, we shall begin by explicitly calling their WFST mappings with `pynini.string_map`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "u0H4qg4BjYfB" - }, - "outputs": [], - "source": [ - "zero = pynini.string_map([(\"zéro\",\"0\")]) # French only pronounces zeroes as stand alone\n", - "digits = pynini.string_map([ # pynini function that creates explicit input-output mappings for a WFST\n", - "\t\t\t\t(\"un\",\"1\"),\n", - "\t\t\t\t(\"une\",\"1\"),\n", - "\t\t\t\t(\"deux\",\"2\"),\n", - "\t\t\t\t(\"trois\",\"3\"),\n", - "\t\t\t\t(\"quatre\",\"4\"),\n", - "\t\t\t\t(\"cinq\",\"5\"),\n", - "\t\t\t\t(\"six\",\"6\"),\n", - "\t\t\t\t(\"sept\",\"7\"),\n", - "\t\t\t\t(\"huit\",\"8\"),\n", - "\t\t\t\t(\"neuf\",\"9\")\n", - "])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0nHjY-NNjdWQ" - }, - "source": [ - "We may also simply write a `tsv` file in a separate data folder " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- zéro\t0\n", - "- un\t1\n", - "- une\t1\n", - "- deux\t2\n", - "- trois\t3\n", - "- quatre\t4\n", - "- cinq\t5\n", - "- six\t6\n", - "- sept\t7\n", - "- huit\t8\n", - "- neuf\t9" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xicKcZLEzQTg" - }, - "source": [ - "and import with `string_file`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`digits = pynini.string_file(\"data/digits.tsv\")`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If utils.py is in working directory you may also use `get_abs_path`, which will always call paths relative to your {LANGUAGE} directory:\n", - "\n", - "`from nemo_text_processing.inverse_normalization.{LANGUAGE}.utils import get_abs_path`\n", - "\n", - "`digits = pynini.string_file(get_abs_path(\"data/digits.tsv\"))`" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yPccmicQkYAB" - }, - "source": [ - "While we will use `string_map` throughout this tutorial, please note that NeMo employs the later option for maintainability and recommends its use instead." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Teens" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FQJiJcVMrNmC" - }, - "source": [ - "Let us consider our next set of numbers:\n", - "- 10 - dix\n", - "- 11 - onze\n", - "- 12 - douze\n", - "- 13 - treize\n", - "- 14 - quatorze\n", - "- 15 - quinze\n", - "- 16 - seize\n", - "- 17 - dix-sept\n", - "- 18 - dix-huit\n", - "- 19 - dix-neuf\n", - "\n", - "Like before, we can simply use `string_map` to compose a WFST for them. But note how there is some redundancy in the number set: `17`, `18`, and `19` are all of the form `dix + digit`. It would be more efficient simply to reuse our prior WFST in these cases than simply creating new arcs, states, and weights. \n", - "\n", - "We can achieve this using pynini's string concatenation function to extend the accepted input strings. First we will create an WFST for `11-16`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "orSgBwyXsfY5" - }, - "outputs": [], - "source": [ - "teens = pynini.string_map([\n", - "\t\t\t\t(\"onze\",\"11\"),\n", - "\t\t\t\t(\"douze\",\"12\"),\n", - "\t\t\t\t(\"treize\",\"13\"),\n", - "\t\t\t\t(\"quatorze\",\"14\"),\n", - "\t\t\t\t(\"quinze\",\"15\"),\n", - "\t\t\t\t(\"seize\",\"16\"),\n", - "])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "s1yIgigdtriQ" - }, - "source": [ - "Now, we will create a `tens` WFST that is responsible for mapping all instances of \"dix\" and concatenate (accomplished with the overloaded `+` operator) with the prior `digits` WFST. (Deleting any possible hyphens in-between with a build in `delete_hyphen`.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "CzwZrFCkt87W" - }, - "outputs": [], - "source": [ - "tens = pynini.string_map([(\"dix\", \"1\")])\n", - "delete_hyphen = pynini.closure(pynutil.delete(\"-\"), 0, 1) # Applies a closure from 0-1 of operation. Equivalent to regex /?/\n", - "\n", - "graph_tens = tens + delete_hyphen + digits" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2knCwybmuTDn" - }, - "source": [ - "We now can combine the `teens` and `graph_tens` WFST together through the union operation (done with the overloaded `|` operator), allowing our choice of either graph." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WIRJ4PE7uRrl" - }, - "outputs": [], - "source": [ - "graph_tens_and_teens = graph_tens | teens" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TGkzKoeuxbeA" - }, - "source": [ - "Let's see if it works through the string function:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "v2iD0_HnxdUV", - "outputId": "1d8f434f-ff8a-4c85-b8d0-1127e4587ddf" - }, - "outputs": [], - "source": [ - "apply_fst(\"dix-huit\", graph_tens_and_teens)\n", - "apply_fst(\"seize\", graph_tens_and_teens)\n", - "apply_fst(\"dix\", graph_tens_and_teens)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Yh2f-3rux8_2" - }, - "source": [ - "The first two worked, but why did we get an error with \"dix\"? If you look back, you'll notice that while our graph has a mapping from \"dix\" to `1` - the concatenation with `digits` makes the assumption that some input from those strings will follow. That is, we left no opportunity for an *omission* of `digits`.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OM_eJYlV1UVp" - }, - "source": [ - "![dix_to_digits.png](images/dix_to_digits.PNG)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "M4xCMKRA1Wzw" - }, - "source": [ - "You may also note that this issue would hold also if we wanted to normalize only digits - our graph would error out since it's expecting a `tens` or input first. \n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XJHnlJCm1dPv" - }, - "source": [ - "We can fix both of these problems by allowing an option to simply insert a zero without any extra input. 
(Much like our \"cent\" example.)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9_vvJ9Bl1dYQ" - }, - "source": [ - "![dix_to_digits_with_insert.png](images/dix_to_digits_with_insert.PNG)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hJq3uoMN2OcC" - }, - "source": [ - "This may be accomplished through use of the `pynutil.insert` function:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7h9xuNfA081P" - }, - "outputs": [], - "source": [ - "graph_digits = digits | pynutil.insert(\"0\") # inserts zero if no digit follows" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fA_L_6Ky2SHm" - }, - "source": [ - "And for `graph_tens`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jelVA81o2RXu" - }, - "outputs": [], - "source": [ - "tens = tens | pynutil.insert(\"0\") | tens + delete_hyphen\n", - "graph_tens = tens + graph_digits" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gb5uhpGr3I4X" - }, - "source": [ - "Bringing everything together:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bLkDddkA3Stu" - }, - "outputs": [], - "source": [ - "graph_teens_and_tens = graph_tens | teens\n", - "graph_all = graph_teens_and_tens | zero " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DESDKScv3r3P" - }, - "source": [ - "Let us now check our tests:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "7wrDNXuD3oh9", - "outputId": "661d2526-5aa0-4640-9285-bca15cd56c75" - }, - "outputs": [], - "source": [ - "apply_fst(\"dix-huit\", graph_all) \n", - "apply_fst(\"seize\" , graph_all)\n", - "apply_fst(\"dix\" , graph_all) \n", - "apply_fst(\"une\" , graph_all) \n", - "apply_fst(\"trois\" , graph_all) \n", - "apply_fst(\"quatre\" , graph_all) \n", - "apply_fst(\"zéro\" , graph_all)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Tz_k3NoB66Bv" - }, - "source": [ - "Now we have no more error - albeit at the cost of leading zeroes. (We will take care of this later in the section.)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Tens" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2dJZAhE57an3" - }, - "source": [ - "Now that we've taken care of the teens, we can proceed with the rest of the tens. Like many languages, French employs a (fairly) regular schema of: `tens_digit + ones_digit` for 20-100. Indeed, we can summarize 20-69 in the following template:\n", - "\n", - "- 20 - vingt\n", - "- 21 - vingt-et-un\n", - "- 22 - vingt-deux\n", - "- 23 - vingt-trois\n", - "- 24 - vingt-quatre\n", - "- 25 - vingt-cinq\n", - "- 26 - vingt-six\n", - "- 27 - vingt-sept\n", - "- 28 - vingt-huit\n", - "- 29 - vingt-neuf\n", - "- 30 - trente\n", - "- 31 - trente-et-un\n", - "- 32 - trente-deux\n", - "- 33 - trente-trois\n", - "...\n", - "- 40 - quarante\n", - "...\n", - "- 50 - cinquante\n", - "...\n", - "- 60 - soixante\n", - "..." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BuaxVG35UKcs" - }, - "source": [ - "Expanding `tens` is fairly easy to accommodate this template: we simply extend our earlier `string_map` for the new terms in the 'tens place.' 
From there, we once again concatenate the `digits` WFST (along with a simple WFST to delete the \"-et-\" term that occasionally occurs.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qAnXlRkR32wt" - }, - "outputs": [], - "source": [ - "tens = pynini.string_map([\n", - "\t\t\t\t(\"dix\", \"1\"),\n", - "\t\t\t\t(\"vingt\",\"2\"),\n", - "\t\t\t\t(\"trente\",\"3\"),\n", - "\t\t\t\t(\"quarante\",\"4\"),\n", - "\t\t\t\t(\"cinquante\",\"5\"),\n", - "\t\t\t\t(\"soixante\",\"6\"),\n", - "\t\t])\n", - "\n", - "graph_et = pynutil.delete(\"-et-\")\n", - "\n", - "tens = tens | pynutil.insert(\"0\") | tens + pynutil.delete(\"-\") | tens + graph_et\n", - "\n", - "graph_tens = tens + graph_digits\n", - "graph_teens_and_tens = graph_tens | teens\n", - "graph_all = graph_teens_and_tens | zero " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-hJwqPDx8I2R" - }, - "source": [ - "#### Special Cases: 70-99" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zvBLvJdY9XPA" - }, - "source": [ - "However, things get tricky once we go beyond the 60s. Here, standard French possesses a notorious pseudo-vigesimal (base-20) system. For numbers 70-99:\n", - "\n", - "- 70 - soixante-dix <- Literally in English: \"sixty-ten\"\n", - "- 71 - soixante-et-onze <- Literally in English: \"sixty-and-eleven\"\n", - "- 72 - soixante-douze\n", - "- 73 - soixante-treize\n", - "- 74 - soixante-quatorze\n", - "- 75 - soixante-quinze\n", - "- 76 - soixante-seize\n", - "- 77 - soixante-dix-sept\n", - "- 78 - soixante-dix-huit\n", - "- 79 - soixante-dix-neuf\n", - "- 80 - quatre-vingts <- Literally in English: \"four-twenties\"\n", - "- 81 - quatre-vingt-un\n", - "- 82 - quatre-vingt-deux\n", - "- 83 - quatre-vingt-trois\n", - "- 84 - quatre-vingt-quatre\n", - "- 85 - quatre-vingt-cinq\n", - "- 86 - quatre-vingt-six\n", - "- 87 - quatre-vingt-sept\n", - "- 88 - quatre-vingt-huit\n", - "- 89 - quatre-vingt-neuf\n", - "- 90 - quatre-vingt-dix <- Literally in English: \"four-twenties-ten\"\n", - "- 91 - quatre-vingt-onze\n", - "- 92 - quatre-vingt-douze\n", - "- 93 - quatre-vingt-treize\n", - "- 94 - quatre-vingt-quatorze\n", - "- 95 - quatre-vingt-quinze\n", - "- 96 - quatre-vingt-seize\n", - "- 97 - quatre-vingt-dix-sept\n", - "- 98 - quatre-vingt-dix-huit\n", - "- 99 - quatre-vingt-dix-neuf" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HQNiwFDyVV_3" - }, - "source": [ - "As before, we want to take advantage of as much redundancy as we can without creating additional ambiguities that will impede graph traversal. \n", - "\n", - "We first note that - despite repeating prior words - \"quatre-vingt\" can be mapped to `8` without introducing ambiguity. This is because, despite \"quatre\" and \"vingt\" being present in our prior graphs, our WFST has no pathing for them in this exact order. As such, we can simply add it to `tens` and immediately improve our coverage for 81-89. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "AvJqaHhE9Wbd" - }, - "outputs": [], - "source": [ - "tens = pynini.string_map([\n", - "\t\t\t\t(\"dix\", \"1\"),\n", - "\t\t\t\t(\"vingt\",\"2\"),\n", - "\t\t\t\t(\"trente\",\"3\"),\n", - "\t\t\t\t(\"quarante\",\"4\"),\n", - "\t\t\t\t(\"cinquante\",\"5\"),\n", - "\t\t\t\t(\"soixante\",\"6\"),\n", - " (\"quatre-vingt\", \"8\")\n", - "\t\t])\n", - "tens = tens | pynutil.insert(\"0\") | tens + delete_hyphen | tens + graph_et\n", - "graph_tens = tens + graph_digits\n", - "graph_teens_and_tens = graph_tens | teens\n", - "graph_all = graph_teens_and_tens | zero " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0_DtcpZxZTzX" - }, - "source": [ - "Of course, now we permit the occurrence of:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "V2leANlDhCvj", - "outputId": "db8d5d02-c848-4e50-df23-d8499538281c" - }, - "outputs": [], - "source": [ - "apply_fst(\"quatre-vingt\", graph_all)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_X_ef3sihCHH" - }, - "source": [ - "which is invalid (French uses the plural \"quatre-vingt**s**\" here.) " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vgKT903Y6rIQ" - }, - "source": [ - "Should we alter the grammar because of this? Such a decision will largely be dependent on your intended implementation and design aims. If you see the question of 'legal' tokens as a responsibility of your upstream model, then there is no need for any alteration: \"quatre-vingt\" as a standalone token will simply not occur, so there is no input to be concerned with.\n", - "\n", - "However, if your ITN grammars are developed for an environment with low-fidelity ASR and/or where mistaken transcriptions incur heavy loss (e.g. ASR for driving directions, telephone-numbers, banking) then you may wish to err on the side of caution." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Hf_FghLT7jdY" - }, - "source": [ - "If we wanted to go for the latter, we would want to mark that \"quatre-vingts\" maps **only** to `80`. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JliFTF3mZSsJ" - }, - "outputs": [], - "source": [ - "quatre_vingt_plural = pynini.string_map([\n", - " (\"quatre-vingts\", \"80\")\n", - "\t\t])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "81_b3XPbicT1" - }, - "source": [ - "And that \"quatre vingt\" can only accompany non-zero digits:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "E4_dmg6uin2j" - }, - "outputs": [], - "source": [ - "quatre_vingt_singular = pynini.string_map([\n", - " (\"quatre-vingt-\", \"8\") # Note that the hyphen can be assumed now\n", - "\t\t])\n", - "graph_digits_without_zero = pynini.string_map([\n", - "\t\t\t\t(\"un\",\"1\"),\n", - "\t\t\t\t(\"une\",\"1\"),\n", - "\t\t\t\t(\"deux\",\"2\"),\n", - "\t\t\t\t(\"trois\",\"3\"),\n", - "\t\t\t\t(\"quatre\",\"4\"),\n", - "\t\t\t\t(\"cinq\",\"5\"),\n", - "\t\t\t\t(\"six\",\"6\"),\n", - "\t\t\t\t(\"sept\",\"7\"),\n", - "\t\t\t\t(\"huit\",\"8\"),\n", - "\t\t\t\t(\"neuf\",\"9\")\n", - "])\n", - "graph_eighties = (quatre_vingt_singular + graph_digits_without_zero) | quatre_vingt_plural" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mL7jpekV8VgP" - }, - "source": [ - "For the `70`'s and `90`'s, we would likewise need to form exclusive configurations for their number series, rewriting digits to recognize \"onze\", \"douze\", \"treize\"... as `1,2,3....` (Note, we'll have to separate `71` and `91` to manage \"soixante-**et**-onze\" vs. \"quatre-vingt-onze\".)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y3dYkwK29zCX" - }, - "outputs": [], - "source": [ - "seventy_and_ninety = pynini.string_map([\n", - " (\"soixante-dix\", \"70\"),\n", - " (\"quatre-vingt-dix\", \"90\"),\n", - "\t\t])\n", - "\n", - "seventy_and_ninety_tens = pynini.string_map([\n", - " (\"soixante-\", \"7\"),\n", - " (\"quatre-vingt-\", \"9\"),\n", - "\t\t])\n", - "\n", - "seventy_and_ninety_one = pynini.string_map([\n", - " (\"soixante-et-onze\", \"71\"),\n", - " (\"quatre-vingt-onze\", \"91\"),\n", - "\t\t])\n", - "\n", - "seventy_and_ninety_digits = digits = pynini.string_map([ \n", - "\t\t\t\t(\"douze\",\"2\"),\n", - "\t\t\t\t(\"treize\",\"3\"),\n", - "\t\t\t\t(\"quatorze\",\"4\"),\n", - "\t\t\t\t(\"quinze\",\"5\"),\n", - "\t\t\t\t(\"seize\",\"6\"),\n", - "\t\t\t\t(\"dix-sept\",\"7\"), # For 97-99, digits are used as normal.\n", - "\t\t\t\t(\"dix-huit\",\"8\"),\n", - "\t\t\t\t(\"dix-neuf\",\"9\")\n", - "])\n", - "\n", - "graph_seventies_and_nineties = (seventy_and_ninety_tens + seventy_and_ninety_digits) | seventy_and_ninety | seventy_and_ninety_one " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4NCrCwEH9HVg" - }, - "source": [ - "Now we union them with our original `tens` series:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "psGCgxaH-btn" - }, - "outputs": [], - "source": [ - "tens = pynini.string_map([\n", - "\t\t\t\t(\"dix\", \"1\"),\n", - "\t\t\t\t(\"vingt\",\"2\"),\n", - "\t\t\t\t(\"trente\",\"3\"),\n", - "\t\t\t\t(\"quarante\",\"4\"),\n", - "\t\t\t\t(\"cinquante\",\"5\"),\n", - "\t\t\t\t(\"soixante\",\"6\"),\n", - "\t\t])\n", - "tens = tens | pynutil.insert(\"0\") | tens + delete_hyphen | tens + graph_et\n", - "\n", - "graph_tens = tens + graph_digits\n", - "graph_tens_with_special_cases = graph_tens | graph_seventies_and_nineties | graph_eighties\n", - "graph_teens_and_tens = graph_tens_with_special_cases | teens\n", - "graph_all 
= graph_teens_and_tens | zero " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xWjSAGRX_s0H" - }, - "source": [ - "Making sure test cases work:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kapWmgos-xcn", - "outputId": "5e9c6f5c-1450-495f-cadf-2945355b651c" - }, - "outputs": [], - "source": [ - "apply_fst(\"quatre-vingt-treize\" , graph_all)\n", - "apply_fst(\"quatre-vingts\", graph_all)\n", - "apply_fst(\"quatre-vingt-deux\", graph_all)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hNUepfKZ_vS_" - }, - "source": [ - "And the other cases fail as expected:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "wo2pCOXGAgYn", - "outputId": "0bbe2792-8bc9-40f7-dd28-4745bd1390e3" - }, - "outputs": [], - "source": [ - "apply_fst(\"quatre-vingt\", graph_all)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4VPuCTTtigh-" - }, - "source": [ - "Of course, there are other ways we could have reconfigured the grammar: we could simply make specific graphs for multiples of ten (`10,20,30..`) and all cases where \"-et-\" occurs (`21,31,41,51...91`). \n", - "\n", - "But this ignores a more important question: was any of this necessary in the first place? All these extra grammars did was simply expand coverage for thirty additional cardinals. And they still didn't exclude all faulty inputs! Note the following cases:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "KICvpeewCFyH", - "outputId": "174dd910-7329-4a5f-a5b0-5e796a174217" - }, - "outputs": [], - "source": [ - "apply_fst(\"dix-une\", graph_all) # supposed to be \"onze\"\n", - "apply_fst(\"dix-deux\", graph_all) # supposed to be \"douze\"\n", - "apply_fst(\"vingt-un\", graph_all) # supposed to be \"vingt-et-un\"\n", - "apply_fst(\"trente-un\", graph_all) # supposed to be \"trente-et-un\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0D130jIVCLp2" - }, - "source": [ - "We *still* need to address possible edge cases!\n", - "\n", - "All of this is to say that knowing your input domain before construction is imperative, as small decisions can easily determine your output range later down the line.\n", - "\n", - "Indeed, if you're particularly concerned with limiting input possibilities, it may be valid simply to write all unique options within a `string_map`. While a tad inelegant, it certainly assists in controlling your outputs." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RSp9w5ayA9ii" - }, - "outputs": [], - "source": [ - "graph_tens_special = pynini.string_map([\n", - "\t\t\t\t(\"soixante-dix\", \"70\"),\n", - "\t\t\t\t(\"soixante-et-onze\",\"71\"),\n", - "\t\t\t\t(\"soixante-douze\",\"72\"),\n", - "\t\t\t\t(\"soixante-treize\",\"73\"),\n", - "\t\t\t\t(\"soixante-quatorze\",\"74\"),\n", - "\t\t\t\t(\"soixante-quinze\",\"75\"),\n", - "\t\t\t\t(\"soixante-seize\",\"76\"),\n", - "\t\t\t\t(\"soixante-dix-sept\",\"77\"),\n", - "\t\t\t\t(\"soixante-dix-huit\",\"78\"),\n", - "\t\t\t\t(\"soixante-dix-neuf\",\"79\"),\n", - "\t\t\t\t(\"quatre-vingts\", \"80\"),\n", - "\t\t\t\t(\"quatre-vingt-un\", \"81\"),\n", - "\t\t\t\t(\"quatre-vingt-une\", \"81\"),\n", - "\t\t\t\t(\"quatre-vingt-deux\",\"82\"),\n", - "\t\t\t\t(\"quatre-vingt-trois\",\"83\"),\n", - "\t\t\t\t(\"quatre-vingt-quatre\",\"84\"),\n", - "\t\t\t\t(\"quatre-vingt-cinq\",\"85\"),\n", - "\t\t\t\t(\"quatre-vingt-six\",\"86\"),\n", - "\t\t\t\t(\"quatre-vingt-sept\",\"87\"),\n", - "\t\t\t\t(\"quatre-vingt-huit\",\"88\"),\n", - "\t\t\t\t(\"quatre-vingt-neuf\",\"89\"),\n", - "\t\t\t\t(\"quatre-vingt-dix\",\"90\"),\n", - "\t\t\t\t(\"quatre-vingt-onze\",\"91\"),\n", - "\t\t\t\t(\"quatre-vingt-douze\",\"92\"),\n", - "\t\t\t\t(\"quatre-vingt-treize\",\"93\"),\n", - "\t\t\t\t(\"quatre-vingt-quatorze\",\"94\"),\n", - "\t\t\t\t(\"quatre-vingt-quinze\",\"95\"),\n", - "\t\t\t\t(\"quatre-vingt-seize\",\"96\"),\n", - "\t\t\t\t(\"quatre-vingt-dix-sept\",\"97\"),\n", - "\t\t\t\t(\"quatre-vingt-dix-huit\",\"98\"),\n", - "\t\t\t\t(\"quatre-vingt-dix-neuf\",\"99\"),])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NUPs1qOUg-hE" - }, - "source": [ - "Which is more efficient? Once again, it is dependent on your language and implementation. If we simply visualize each graph and their number of states:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "sQ9GsIkNzxsU", - "outputId": "d70ca927-9c43-4f49-846c-c181e725e011" - }, - "outputs": [], - "source": [ - "constructed_version = (graph_seventies_and_nineties | graph_eighties)\n", - "constructed_version.num_states()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Xsgdu5TYx09_", - "outputId": "5812912f-883b-42e8-afbf-3ec4a0170345" - }, - "outputs": [], - "source": [ - "string_map_version = graph_tens_special\n", - "string_map_version.num_states()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9jzn_U7s0Sit" - }, - "source": [ - "We see that their numbers of states (graph vertices) are almost equal. Yet, if we use `pynini.optimize` - a method that calls a suite of WFST minimization algorithms: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "7YtqhOY90iF0", - "outputId": "26f0f51b-b00d-4f5a-9b2f-330c9812666a" - }, - "outputs": [], - "source": [ - "constructed_version.optimize()\n", - "constructed_version.num_states()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "y93SqnOf0qa8", - "outputId": "74efcbfa-a272-4fc6-e36e-f1e31c6df221" - }, - "outputs": [], - "source": [ - "string_map_version.optimize()\n", - "string_map_version.num_states()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2cTdQj9L0xhl" - }, - "source": [ - "We see the latter possessing a significantly larger number of graph vertices. 
\n", - "\n", - "So the decision will be dependent on your ITN needs, language, concern with efficiency, and design philosophy. Further, even decisions of language dialect will have an influence. \n", - "(e.g. Belgian, Canadian, and Swiss dialects of French will dispense with elements of the vigecimal system for the decimal schema.)\n", - "\n", - "**N.B.** *For reference: while `nemo_text_processing` grammars aim to minimize invalid productions, they assume input tokens are valid strings for a target language. (e.g. The mapping of \"quatre-vingt\" to `80` is permitted since it is not likely to occur in a valid French string.)* " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "V1djCnvY3CjW" - }, - "source": [ - "For more information on optimization algorithms for WFSTs, please see:\n", - "\n", - "- [M. Mohri,\"Generic epsilon-removal and input epsilon-normalization algorithms for weighted transducers\"](https://cs.nyu.edu/~mohri/pub/ijfcs.pdf)\n", - "- [M. Mohri, \"Weighted automata algorithms\"](https://cs.nyu.edu/~mohri/pub/hwa.pdf)\n", - "- [K. Thompson, \"Programming techniques: regular expression search algorithm\"](http://www.oilshell.org/archive/Thompson-1968.pdf)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hundreds\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dqPUdVBbi6gU" - }, - "source": [ - "\n", - "Moving on to the case of three digit cardinals (\"hundreds\"), it is likely that your chosen language becomes more regular in its schema. For instance, practically all French numbers `>100` obey the following:\n", - "\n", - "- `digit_from_1_to_9 + word_for_hundred + digit_from_1_to_99`\n", - "\n", - "For example:\n", - "- `203` - \"deux-cent-trois\"\n", - "- `530` - \"cinq-cent-trente\"\n", - "- `880` - \"huit-cent-quatre-vingt\"\n", - "\n", - "As such, we can write a simple `hundreds` WFST as:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lOt-gc-FiF-X" - }, - "outputs": [], - "source": [ - "hundreds = graph_digits + delete_hyphen + pynutil.delete(\"cent\") + delete_hyphen + graph_all" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Fyn1uL_NoEiz", - "outputId": "d491680b-1b3e-4762-8470-497833b82b0e" - }, - "outputs": [], - "source": [ - "apply_fst(\"deux-cent-trois\", hundreds)\n", - "apply_fst(\"huit-cent-quatre-vingts\", hundreds)\n", - "apply_fst(\"cinq-cent-trente\" , hundreds) " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qDjq_KfnoD5C" - }, - "source": [ - "Indeed, the use of French only presents two complications:\n", - "- French uses *only* the word \"cent\" for `100`. (Instead of \"un cent\".)\n", - "- 'Pure' multiples of a hundred (`200,300,400....`) use the plural \"cents\".\n", - "\n", - "The second one is the easier of the two so let's start there. There are actually two options open to us. First, we could treat \"cents\" the same way as we did \"cent\" in the base case and simply delete it. From there, the lack of any following inputs will allow the WFST to insert the trailing zeroes as appropriate." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "m2F-sumbxqLE" - }, - "outputs": [], - "source": [ - "cents = pynini.accep(\"cent\") | pynini.accep(\"cents\") # Creates a Finite State (Accep)tor, mapping inputs back to themselves\n", - "hundreds = graph_digits + delete_hyphen + pynutil.delete(cents) + delete_hyphen + graph_all" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VisQu_Etx-QB" - }, - "source": [ - "Or we can use it as a cue to 'shortcut' the WFST to immediately insert zeroes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VspiTN5Vxxjl" - }, - "outputs": [], - "source": [ - "graph_cents = pynini.cross(\"cents\", \"00\") # Creates a single input-output mapping\n", - "hundreds = graph_digits + delete_hyphen + ((pynutil.delete(\"cent\") + delete_hyphen + graph_all) | graph_cents)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "meVn5BiyyX5v" - }, - "source": [ - "For the case of solitary \"cent\", we need to make sure our output is `1` only in the case that no digit precedes the occurrence. Here we need to be confident in the structure of our WFST and that any possible ambiguity has been dealt with by this point. (Something to keep in mind as we move to the thousands.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "277Z-zLWyWAf" - }, - "outputs": [], - "source": [ - "graph_cent = pynini.cross(\"cent\", \"1\")\n", - "graph_hundreds_first_digit = (graph_digits + delete_hyphen + pynutil.delete(cents)) | graph_cent\n", - "graph_hundreds = graph_hundreds_first_digit + delete_hyphen + graph_all" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "FNZlJsvS_Yvt", - "outputId": "e85ae561-e7a1-4b6a-e394-f0194fdb89e7" - }, - "outputs": [], - "source": [ - "apply_fst(\"trois-cents\", graph_hundreds) \n", - "apply_fst(\"cent\", graph_hundreds)\n", - "apply_fst(\"cent-trois\", graph_hundreds) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Thousands" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e7Dy5slLzp-K" - }, - "source": [ - "For quite a few languages, managing the WFST for the thousands place is the last aspect to figure out, as the higher powers of ten reuse the same schema. (For those working with counting systems that reserve special terms for \"ten-thousand\" (e.g. Chinese derived counting systems), you may need to extend unique coverage to the next power of ten.)\n", - "\n", - "For French, the question of thousands is rather simple: `digits_from_1_to_999 + mille + digits_from_1_to_999`\n", - "\n", - "With only the exception that any expression of one thousand drops a leading digit. 
\n", - "- `1,000` -> \"mille\"\n", - "- `1,001` -> \"mille-un\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "AvsnAAiPzlu_" - }, - "outputs": [], - "source": [ - "graph_one_thousand = pynini.cross(\"mille\", \"1\")\n", - "graph_many_thousand = graph_hundreds + delete_hyphen + pynutil.delete(\"mille\")\n", - "\n", - "graph_thousands = (graph_one_thousand | graph_many_thousand) + delete_hyphen + graph_hundreds" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "i3m9TG7Y4tkl", - "outputId": "d3f1f81d-c463-4934-9df7-3b8f2b67798f" - }, - "outputs": [], - "source": [ - "apply_fst(\"cent-mille-deux-cents\", graph_thousands)\n", - "apply_fst(\"deux-cent-mille-deux-cents\", graph_thousands)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NoevSTZGGT17" - }, - "source": [ - "### Weighting" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "A2gcVIZM0-iv" - }, - "source": [ - "Question: will this cover all our grammar so far? (Hint: what assumptions were made about \"cent\"/\"cents\"?)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cCFtPhr1BjAc", - "outputId": "048e0d93-a4a8-4f4e-d461-bfd70e911aff" - }, - "outputs": [], - "source": [ - "apply_fst(\"deux-mille-un\", graph_thousands)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ne-7L9Cd4t-8" - }, - "source": [ - "Once again, we need to introduce the possibility of the prior power of ten not occurring in the string. There must be an option for simply inserting a string of `0` in place of the omitted \"cent\"." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iockqXdn-aG4" - }, - "source": [ - "Further, we want to be careful with how cavalier we have been with insertions. Consider the following:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "bxJlSnj2-Xw3", - "outputId": "6722e5ef-8a7f-43e1-84fe-b3f5f18307e1" - }, - "outputs": [], - "source": [ - "apply_fst(\"mille-cent-un\", graph_thousands) # Should be 1101\n", - "apply_fst(\"mille-cent\", graph_thousands) # 1100" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fq5zEayA-kOx" - }, - "source": [ - "It appears that our WFST has developed a tendency to simply 'ignore' some of these higher powers. Let us return to our code for `graph_hundreds` and `graph_thousands`. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "S2aV1KQ4-1iP" - }, - "outputs": [], - "source": [ - "graph_cents = pynini.cross(\"cents\", \"00\")\n", - "graph_cent = pynini.cross(\"cent\", \"1\")\n", - "graph_hundreds_first_digit = (graph_digits + delete_hyphen + pynutil.delete(cents)) | graph_cent\n", - "graph_hundreds = (graph_hundreds_first_digit + delete_hyphen | pynutil.insert(\"0\")) + graph_all \n", - "\n", - "graph_one_thousand = pynini.cross(\"mille\", \"1\")\n", - "graph_many_thousand = graph_hundreds + delete_hyphen + pynutil.delete(\"mille\")\n", - "graph_thousands = (graph_one_thousand | graph_many_thousand) + delete_hyphen + graph_hundreds" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9avwOIkk-9qt" - }, - "source": [ - "Recall that throughout we have provided options for simply inserting zeroes in the case of omitted numbers? 
That tendency has finally caught up with us. The use of our previous `graph_hundreds` in `graph_many_thousands` now allows our graph to insert a string of `0`'s without penalty. \n", - "\n", - "You may note that this is very similar to the \"cents\" example brought up at the beginning, presenting a similar solution. We can control this output by making it too costly to traverse unless absolutely necessary for the graph. This can be accomplished simply by appending a weight to the insertion for hundreds:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MQG3j0U8CUAQ" - }, - "outputs": [], - "source": [ - "graph_hundreds = (graph_hundreds_first_digit + delete_hyphen | pynutil.insert(\"0\", weight=.1)) + graph_all \n", - "\n", - "graph_one_thousand = pynini.cross(\"mille\", \"1\")\n", - "graph_many_thousand = graph_hundreds + delete_hyphen + pynutil.delete(\"mille\")\n", - "graph_thousands = (graph_one_thousand | graph_many_thousand) + delete_hyphen + graph_hundreds" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "KNHhrYZ7Ca58", - "outputId": "a7d07372-733d-4837-c1e9-1dc58ba2b87c" - }, - "outputs": [], - "source": [ - "apply_fst(\"mille-cent-un\", graph_thousands)\n", - "apply_fst(\"mille-cent\", graph_thousands)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "51yPEaf2EkbD" - }, - "source": [ - "Why choose a weight of `.1`? Quite simply: it's arbitrary. As mentioned earlier, the default graph in `pynini` is a tropical semiring, which uses the `min` function to select among two arcs for path traversal. Since all our paths so far are weight `0`, any positive value will ensure that it is a last option among path traversal. (Note, this conversely entails any negative weight path will be prioritized.)\n", - "\n", - "That we chose this number as a small value comes from a place of caution: the tropical semiring uses an additive function to calculate the total weight of an entire path to traverse a WFST. As our grammars can easily become massive, this means that small weights can have major impact down the line. Further, by constraining path weights to small values, we can have general certainty towards the maximum weight of any individual graph, allowing us to add constraints regarding maximum token length and token hierarchy. (As explained in [later sections](#classifyweights).) As such, when using weights in a localized setting, it is best to use small values to avoid unforeseen escalation. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iScKgvRxGt-B" - }, - "source": [ - "### Higher Powers\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rtHEd6OE2WSg" - }, - "source": [ - "At this point, we can propose a general heuristic with escalating to higher powers of ten: they always need a way for their absence to be accommodated in the WFST. Further, they require some weighting to prevent this absence from developing into a string of omitted values. To avoid further bumps, we'll take care of this now with `graph_thousands`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "iZMN7wcE2lH5" - }, - "outputs": [], - "source": [ - "graph_one_thousand = pynini.cross(\"mille\", \"1\")\n", - "graph_many_thousand = graph_hundreds + delete_hyphen + pynutil.delete(\"mille\")\n", - "graph_thousands = (graph_one_thousand | graph_many_thousand | pynutil.insert(\"000\", weight=.001)) + delete_hyphen + graph_hundreds" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Fkc3LIH824P7" - }, - "source": [ - "\n", - "For the rest of French (and many other languages), the rest of the work is simply repeating the prior pattern for the thousands element: \n", - "`hundreds + word_for_higher_power + hundreds.....` Of course there will be some variation in this schema, but the recursion should be regular. (It is rather rare that languages appropriate unique forms for these higher counts.) " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qGnK4ARX4Nay" - }, - "source": [ - "To finish French, we can list off the following equivalent for higher powers of ten:\n", - "- `million` - \"million/millions\" \n", - "- `billion` - \"milliard/milliards\"\n", - "- `trillion` - \"billion/billions\"\n", - "\n", - "Like the \"cent/cents\" rule, these values alternate with a plural form in the case of multiples of the value. Writing them out:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sBu7-dub4vxz" - }, - "outputs": [], - "source": [ - "millions = pynini.accep(\"million\") | pynini.accep(\"millions\")\n", - "graph_millions = ((graph_hundreds + delete_hyphen + pynutil.delete(millions) + delete_hyphen) | pynutil.insert(\"000\", weight=.1) # We need three zeroes now\n", - " ) + graph_thousands" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "LmMeCHXr5Bb5" - }, - "outputs": [], - "source": [ - "billions = pynini.accep(\"milliards\") | pynini.accep(\"milliard\")\n", - "graph_billions = ((graph_hundreds + delete_hyphen + pynutil.delete(billions) + delete_hyphen)| pynutil.insert(\"000\",weight=.1) # We need three zeroes now\n", - " ) + graph_millions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "CIRIeQEg5B0J" - }, - "outputs": [], - "source": [ - "trillions = pynini.accep(\"billion\") | pynini.accep(\"billions\")\n", - "graph_trillions = ((graph_hundreds + delete_hyphen + pynutil.delete(trillions) + delete_hyphen) | pynutil.insert(\"000\",weight=.1) # We need three zeroes now\n", - " ) + graph_billions" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sRNUPx-15J1v" - }, - "source": [ - "Bringing all together:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0dLOWm_B5SwQ" - }, - "outputs": [], - "source": [ - "graph = graph_trillions | zero" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nBFE3BrN6IPR" - }, - "source": [ - "Let's try it out:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6lWwtR1S6LI4", - "outputId": "3a6740ee-9e92-4500-c2c8-965131167e58" - }, - "outputs": [], - "source": [ - "example = \"deux-cent-milliard-quatre-million-deux-cent-quatre-vingt-onze\"\n", - "apply_fst(example, graph) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Finishing Touches" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-w3KgX6C6mff" - }, - "source": [ - "Now 
that we have our cardinal in place, we can take care of that stylistic issue of the leading zeroes. For this, we want to develop a 'filter' that deletes all zeroes preceding the first non-zero in the string, and leave the rest 'as is.'\n", - "\n", - "First let us create the filter by calling on `NEMO_DIGIT`- a `graph_util` WFST that only permits digits as input. With it, we'll create a WFST that will delete all leading zeroes in a sting. We then compose this (using `@`) onto our original graph, creating a new graph that accepts inputs from our original but produces only the outputs of `clean_cardinal`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 290 - }, - "id": "EA4VnRe6FO-2", - "outputId": "59e412b3-a445-4172-ee64-b0f80281a167" - }, - "outputs": [], - "source": [ - "delete_leading_zeroes = pynutil.delete(pynini.closure(\"0\")) # will delete all zeroes under closure. Equivalent to regex * operator\n", - "stop_at_non_zero = pynini.difference(NEMO_DIGIT, \"0\") # creates a graph that accepts all input-outputs from NEMO_DIGIT except 0\n", - "rest_of_cardinal = pynini.closure(NEMO_DIGIT) # accepts all digits that may follow\n", - "\n", - "clean_cardinal = delete_leading_zeroes + stop_at_non_zero + rest_of_cardinal\n", - "clean_cardinal = clean_cardinal | \"0\" # We don't want to ignore the occurrence of zero\n", - "\n", - "graph = graph @ clean_cardinal " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "piP9nqQkHpo3" - }, - "source": [ - "Now our WFST will output our numbers as normal:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dnQ9odSpIAB7" - }, - "outputs": [], - "source": [ - "apply_fst(example, graph)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Final Notes\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "p7zt8lVsK2rY" - }, - "source": [ - "We have finally formulated a grammar that will process French cardinals into numeric representation. Of course, not every grammar you write will be for French. But several of the principles we've worked through will be invaluable in your own development. Before moving on, here's a quick summary of (almost) universal points to take away for WFST construction.\n", - "- Decide at the beginning of construction the level of constraint you wish for your grammar. Is it necessary to have a specific domain or can you rely on upstream models to narrow your input possibilities for you? \n", - "- Work iteratively upwards from the smallest place value of your numeric system. This will assist you in forming building blocks for larger values. \n", - "- Always allow for the possibility of omission of previous place values. (Not every number in the thousands will contain mention of the hundreds place.)\n", - "- For each place value, consider how the sub-grammar will affect the preceding and following place values. Are there exceptions that you've built into the grammar that may become problematic later on?\n", - "- Utilize weights for default insertions to limit path traversal to only final options. When doing so, use small values to avoid escalating problems in your larger grammar." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nvyHg1bQIIHD" - }, - "source": [ - "With that handled, we can move on to converting this grammar into a Classifier." 
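 - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "One housekeeping note before we do: the surrounding cells lean on a handful of shared NeMo utilities - `NEMO_DIGIT`, `NEMO_SIGMA`, `delete_space`, and the `GraphFst` base class. If you are running these cells in a fresh session, an import along the lines of the sketch below makes them available. This assumes you rely on the default English `graph_utils` mentioned earlier; adjust the module path if you keep a local copy in your language directory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Assumed imports for the shared utilities used throughout this tutorial.\n", - "## Adjust the path if you copied graph_utils.py into your {LANGUAGE} directory.\n", - "from nemo_text_processing.text_normalization.en.graph_utils import (\n", - "    NEMO_DIGIT,\n", - "    NEMO_SIGMA,\n", - "    GraphFst,\n", - "    delete_space,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [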
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gJ1YJUvhIZwm" - }, - "source": [ - "## Classifier" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "q2L2x0crIeXQ" - }, - "source": [ - "Now that we have a grammar that will convert individual tokens into number strings, we now want to focus on building it into a classifier to properly tag candidate tokens. This requires a couple of properties:\n", - "- It recognizes any valid token and permits traversal through the WFST graph\n", - "- Conversely, it does not allow invalid tokens to traverse the WFST graph\n", - "- It properly disambiguates overlap among ambiguous cases\n", - "- It attributes the proper attributes to a classified token\n", - "\n", - "While this seems like a lot, in practice this just means that your grammar will need a few more tweaks to improve exclusivity." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ArEYn7RWKcYI" - }, - "source": [ - "NeMo ITN performs token classification through a series of `GraphFst` classes and assumes deployment of your grammars through an object that inherits from this class. As such, you will need to instantiate your grammar as a `CardinalFST` " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 368 - }, - "id": "GWgMSybqLqiS", - "outputId": "597c00ae-0f62-417f-888c-88c81c24a3fc" - }, - "outputs": [], - "source": [ - "class CardinalFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"cardinal\", kind=\"classify\")\n", - " # Rest of the grammar here\n", - " # ....... \n", - " #........." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SIE8dNQlL52G" - }, - "source": [ - "While the naming convention may vary, the `name` and `kind` properties must be set accordingly to permit Sparrowhawk integration.\n", - "\n", - "Further, the resulting graph must produce the classified token within the following format:\n", - "`token { cardinal { integer: \"DIGIT_STRING\" } }`\n", - "\n", - "This is accomplished by a series of string insertions:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "aC_c64KSNTCg" - }, - "outputs": [], - "source": [ - "class CardinalFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"cardinal\", kind=\"classify\")\n", - " # Rest of the grammar here\n", - " # ....... \n", - " #.........\n", - " self.fst = pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AGLQxOSzOK1F" - }, - "source": [ - "Followed by a call of the parent `GraphFst.add_tokens()` method:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Jz-UXFipORps" - }, - "outputs": [], - "source": [ - "class CardinalFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"cardinal\", kind=\"classify\")\n", - " # Rest of the grammar here\n", - " # ....... \n", - " #.........\n", - " self.fst = pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")\n", - " final_graph = self.add_tokens(graph)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gh23S7BHOY0r" - }, - "source": [ - "Which will insert the appropriate formatting. 
Note that this formatting must be exact: a single space must follow each field name and each value must be within escaped double quotes.\n", - "\n", - "In the event that you also wish for `CardinalFst` to indicate negative values, the optional `negative: ` property may be used.\n", - "\n", - "For instance, French indicates negative values by prefacing the quantity with \"moins.\" As such:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3JbTn35cOx0k" - }, - "outputs": [], - "source": [ - "optional_minus_graph = pynini.closure(\n", - " pynutil.insert(\"negative: \") + pynini.cross(\"moins\", \"\\\"-\\\"\") + \" \", 0, 1 # Note the extra space to separate the value from the integer field\n", - ")\n", - "\n", - "final_graph = optional_minus_graph + pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DCs1048v6N0K" - }, - "source": [ - "All together, your `CardinalFst` ultimately serves as a wrapper for your grammar, save with the addition of a few insertions to assist processing:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eo6uEz1s5TJY" - }, - "outputs": [], - "source": [ - "class CardinalFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"cardinal\", kind=\"classify\")\n", - " \n", - " ### Cardinal Grammar....\n", - " ### .....\n", - " graph = graph_trillions | zero \n", - "\n", - " ### Formatting grammar....\n", - " ### .....\n", - " graph = graph @ clean_cardinal\n", - "\n", - " ### Token insertion\n", - " optional_minus_graph = pynini.closure(\n", - " pynutil.insert(\"negative: \") + pynini.cross(\"moins\", \"\\\"-\\\"\") + \" \", 0, 1\n", - " )\n", - "\n", - " final_graph = optional_minus_graph + pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")\n", - "\n", - " final_graph = self.add_tokens(final_graph) # inserts the cardinal tag\n", - "\n", - " self.fst = final_graph" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MFIMdLCoZzLK" - }, - "source": [ - "Let's see a demonstration. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4CF6Iz9NZ7R_" - }, - "outputs": [], - "source": [ - "cardinal = CardinalFst().fst\n", - "\n", - "example = \"moins deux-cent-quatre\"\n", - "\n", - "apply_fst(example, cardinal)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Verbalizer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uvUqpC_Q8FSt" - }, - "source": [ - "The verbalizer can be both the most crucial and simplest part of building each grammar. On one hand, it is the component that finalizes all of your previous work. If it is unable to properly normalize your text, everything has been for naught.\n", - "\n", - "On the other hand, your previous work has vastly limited the unpredictability of your input. Recall from our initial demonstration of the classifier-verbalizer system that and input like <> becomes:\n", - "\n", - "- `tokens { name: \"le\" }`\n", - "- `tokens { date { day: \"1\" month: \"juillet\" }` \n", - "- `tokens { name: \"il\" }` \n", - "- `tokens { name: \"a\" }` \n", - "- `tokens { name: \"mangé\" }`\n", - "- `tokens { cardinal { integer: \"35\" } }` \n", - "- `tokens { name: \"pommes\" }`\n", - "\n", - "Part of the purpose of the two stage set-up is that the input space for each verbalizer is obvious: it's simply the name of its semiotic class. 
As such, we only need to write our grammar to recognize its class, remove tokens accordingly, and then manage the attributes of each semiotic token." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "et1GgmBuAWzY" - }, - "source": [ - "We will begin as we did with our classifier and create a class to inherit from the `GraphFST` utility class:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NNKpgWtkAgEW" - }, - "outputs": [], - "source": [ - "class CardinalFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"cardinal\", kind=\"verbalize\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OyAV39NsAqSN" - }, - "source": [ - "One of the useful aspects of the `GraphFst` utility is that it already possesses a built in graph that will recognize and remove semiotic tokens: `delete_tokens`. As such we need only concern ourselves with managing the properties of the Cardinal class:\n", - "- `integers`\n", - "- `negative`\n", - "\n", - "Here, the desired written format of your chosen language will dictate how you proceed. For French, we have the following rules for Cardinal numbers:\n", - "- A negative sign is written before the numeral.\n", - "- Cardinal numbers representing quantities (e.g. \"mille euros\"/ \"one thousand dollars\") are written with spaces in-between every three digits. (e.g. `1 000`)\n", - "- Cardinal numbers representing place in a sequence or addresses (\"page mille\"/\"page one thousand\") are written without spacing. (`1000`)\n", - "\n", - "The first property seems easy enough to handle: write a grammar that simply removes the `negative` formatting, leaving only `-`. (Recall that our Classifier only inserted the string if it was present.) \n", - "\n", - "For the final two, we may note that our intention to develop WFSTs for the Decimal, Measure, and Money classes already will cover most desired quantities. As such, we can leave the issue of spacing to those instances and let the Cardinal WFST default to the non-spacing case. (Note that this will be helpful with Time, Date, Telephone, Electronic, and Ordinal classes as they will not use the spacing format either. It is usually better to reserve specific formatting rules to other classes and let the Cardinal serve as a default.)\n", - "\n", - "As such, we just need our WFST to remove the `integer` property and `negative` property (if it occurs). 
These can be managed through the `pynutil.delete` function, as seen in the following:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 368 - }, - "id": "6MF2I6SLU7nf", - "outputId": "0437c4af-5c96-4122-8af0-ca37723c7228" - }, - "outputs": [], - "source": [ - "class CardinalFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"cardinal\", kind=\"verbalize\")\n", - " \n", - " # Removes the negative attribute and leaves the sign if occurs\n", - " optional_sign = pynini.closure(\n", - " pynutil.delete(\"negative:\")\n", - " + delete_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.accep(\"-\")\n", - " + pynutil.delete(\"\\\"\")\n", - " + delete_space,\n", - " 0,\n", - " 1,\n", - " )\n", - " \n", - " # removes integer aspect\n", - " graph = (\n", - " pynutil.delete(\"integer:\")\n", - " + delete_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_DIGIT, 1) # Accepts at least one digit\n", - " + pynutil.delete(\"\\\"\")\n", - " )\n", - " \n", - " graph = optional_sign + graph # concatenates two properties\n", - "\n", - " delete_tokens = self.delete_tokens(graph) # removes semiotic class tag\n", - "\n", - " self.fst = delete_tokens.optimize()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QSX2KlZJbRAA" - }, - "source": [ - "Let's see if it will properly render a given token:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JxaLm2k0bYIJ" - }, - "outputs": [], - "source": [ - "cardinal = CardinalFst().fst\n", - "example = 'cardinal { negative: \"-\" integer: \"204\" }'\n", - "\n", - "apply_fst(example, cardinal)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Bc0-QCBHWg-8" - }, - "source": [ - "That's it! We've now completed all aspects of our `CardinalFst` from grammar writing to Verbalization. While we still have quite a few semiotic classes left, you will find that they build off the `CardinalFst` quite easily, making progression much simpler and straightforward.\n", - "\n", - "Before proceeding, there are two things to note:\n", - "- `delete_tokens` is called on the completed graph, despite the token class occurring first in the tokenized string. This is because the function intersects with an initial WFST that deletes the tags. As such, the function must be passed a completed graph.\n", - "- In our initial example, all tokens were enclosed within a `token` category. Insertion and deletion of this category is managed by the main [Classifier](#tokenize-and-classify) and [Verbalizer](#verbalize-and-verbalize-final) respectively and is not a concern during individual class grammar development.\n", - "- Earlier in the tutorial we noted that NeMo ITN permutates all WFSTs unless the `preserve_order` tag is passed as part of the Classifier. This allows you to ignore possible variation in designing the verbalizer and focus on whatever form of processing is easiest for the grammar. That is, the decision to process the `negative` property before the `integer` property is not chosen because of a consequence of the French language but instead because it is easier to write out with `pynini`. \n", - "- Conversely, if your language is completely invariant in this regard, it may be more efficient to pass `preserve_order` through the Classifier and manage the property here in the Verbalizer. 
This allows NeMo ITN to avoid building states and arcs for each permutation, reducing graph size and compilation time." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aFUrbSdJ8Wk7" - }, - "source": [ - "# Ordinal WFST " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "w1b0Z7f5Z9Ar" - }, - "source": [ - "Ordinals are the class of numbers used for enumerating order or placement of entities in a series. In some languages, they are simply derivations of cardinal numbers. For instance, English enumerates order as `first, second, third, fourth, fifth....` After the third ordinal, they become a regular pattern of `cardinal + 'th'`.\n", - "\n", - "Meanwhile, other languages may reserve specific counting systems for ordinals. For example, while Korean uses a Chinese-derived counting system for several Cardinal-related tasks, it uses derivations from a native counting system for ordering:\n", - "\n", - "**Cardinal**/**Ordinal** = **English**\n", - "- il/cheot-jae = \"First\"\n", - "- i/dul-jae = \"Second\"\n", - "- sam/set-jae = \"Third\"\n", - "- sa/net-jae = \"Fourth\"\n", - "- o/daseot-jae = \"Fifth\"\n", - "\n", - "If your language is of the latter variety, you will likely need to begin development of your Ordinal WFST by repeating Cardinal WFST development before proceeding. (Or make it part of your previous Cardinal WFST, combining the two with a `union` operation.) While you can extend coverage to the level of the Cardinal WFST, you will find most Ordinals to be sufficiently covered by enumerating only up to a few hundred. (e.g. Is it common in your language to speak of the \"one millionth\" in an order and/or write out `1,000,000th`?)\n", - "\n", - "For this portion of the tutorial, we will focus on the first type of ordinals - those that are primarily derived by altering Cardinals." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oq_xA8NPiANw" - }, - "source": [ - "## Grammar" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lhjcQS6oiD_w" - }, - "source": [ - "Continuing with our example language, we begin by laying out our expected inputs and pinpointing a regular pattern to guide our WFSTs. We note the following examples:\n", - "\n", - " **English = French**\n", - " - \"first\" = \"premier/première\"\n", - " - \"second\" = \"second/seconde/deuxième\"\n", - " - \"third\" = \"troisième\"\n", - " - \"fourth\" = \"quatrième\"\n", - " - \"fifth\" = \"cinquième\"\n", - " - \"sixth\" = \"sixième\"\n", - " - \"seventh\" = \"septième\"\n", - "\n", - "From our example inputs, it appears that the spelling of French Ordinals follows a general format of: `cardinal + ième`. The only exceptions appear to be in the case of the first and second Ordinals - for which completely different roots appear - and the fourth and the fifth Ordinals - where the former drops the \"e\" at the end of the root (`quatre -> quatr`) and the latter appends a \"u\" (`cinq -> cinqu`). \n", - "\n", - "For the expected outputs, we observe the following examples:\n", - " - \"premier/première\" -> `1ᵉʳ/1ʳᵉ`\n", - " - \"second/seconde\" -> `2ᵈ/2ᵈᵉ`\n", - " - \"deuxième\" -> `2ᵉ`\n", - " - \"troisième\" -> `3ᵉ`\n", - " - \"quatrième\" -> `4ᵉ`\n", - " - \"cinquième\" -> `5ᵉ`\n", - " - \"sixième\" -> `6ᵉ`\n", - " - \"septième\" -> `7ᵉ`\n", - "\n", - "It appears that the output is simply the cardinal number of the root with an associated superscript. 
Since we have already constructed the Cardinal WFST, this means that the job of constructing an Ordinal WFST is simply a case of recognizing the cardinal root for the input and then utilizing a preconstructed Cardinal grammar to render the proper form alongside an associated superscript. That is, our tasks are to:\n", - "- Identify the proper superscript for the ordinal\n", - "- Change the ordinal back into a cardinal\n", - "- Use the Cardinal WFST to transform the cardinal into normalized form\n", - "- Properly render the ordinal using the normalized cardinal and proper superscript\n", - "\n", - "As information regarding the superscript will need to be conveyed through development of the Classifier, we will begin by creating the grammar necessary for rendering the ordinal as its cardinal root. \n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AOUVZhiwT7hE" - }, - "source": [ - "### Stripping Suffixes" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5nw0_lOTsEik" - }, - "source": [ - "Since French forms Ordinals by appending a suffix to Cardinals, we should start by creating a WFST to remove the suffix. Assuming that our grammar processes one token at a time, this means that we just need a WFST that will accept all tokens that end with \"ième\" and then delete the suffix from that token:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Rk89LhsxsHTO" - }, - "outputs": [], - "source": [ - "strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", - "graph_strip_morpheme = NEMO_SIGMA + strip_morpheme # accepts any preceding string, then deletes the trailing suffix" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pLg-PzdntV4N" - }, - "source": [ - "We now have a graph that permits all characters in a word token and deletes the ordinal suffix. (Note that this also means that the graph won't accept tokens without the suffix, helping us avoid false inputs.) \n", - "\n", - "We can now compose this graph with our Cardinal WFST to strip the suffixes from ordinals and treat them as cardinals. However, recall that our `CardinalFst` also inserted its own class tag. Obviously, we do not want to do this here as it will disrupt the formatting of the token. Instead, we should create a new subgraph *within* the `CardinalFst` class that will only produce the cardinals without tokens."
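Before modifying `CardinalFst`, it may help to sanity-check the stripping graph on its own. The following is only a quick sketch and assumes the `apply_fst` helper, `NEMO_SIGMA`, and the `graph_strip_morpheme` graph from the cell above are already available in the notebook:

```python
# Quick sanity check of the suffix-stripping graph (a sketch; relies on the
# apply_fst helper and graph_strip_morpheme defined earlier in this notebook).
# The graph should pass the root through unchanged and delete the trailing "ième".
apply_fst("sixième", graph_strip_morpheme)   # expected output: six
apply_fst("septième", graph_strip_morpheme)  # expected output: sept
```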
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class CardinalFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"cardinal\", kind=\"classify\")\n", - " \n", - " ### Cardinal Grammar....\n", - " ### .....\n", - " graph = graph_trillions | zero \n", - "\n", - " ### Formatting grammar....\n", - " ### .....\n", - " graph = graph @ clean_cardinal\n", - " \n", - " ### NEW GRAPH\n", - " self.just_cardinals = graph # will produce cardinals without formatting\n", - "\n", - " ### Token insertion\n", - " optional_minus_graph = pynini.closure(\n", - " pynutil.insert(\"negative: \") + pynini.cross(\"moins\", \"\\\"-\\\"\") + \" \", 0, 1\n", - " )\n", - "\n", - " final_graph = optional_minus_graph + pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")\n", - "\n", - " final_graph = self.add_tokens(final_graph)\n", - "\n", - " self.fst = final_graph" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we call it for our graph:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vxDgBa4_t1nD" - }, - "outputs": [], - "source": [ - "graph_cardinal = CardinalFst().just_cardinals \n", - "graph_ordinal_regular_suffix = graph_strip_morpheme @ graph_cardinal" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hSpk5M7BuXRz" - }, - "source": [ - "Let's see if it works and gives us the desired cardinal:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7cJ7fieouY2r" - }, - "outputs": [], - "source": [ - "example = \"sixième\" # dervied from six/6\n", - "apply_fst(example, graph_ordinal_regular_suffix)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GtEuV7sOuxek" - }, - "source": [ - "Now we can consider the edge cases. Beyond the first and second ordinals, French exhibits irregular behavior in the following cases:\n", - "- If the cardinal root ends with an \"e\", the \"e\" is dropped before adding the suffix (e.g. \"quatrième\"). \n", - "- Cardinals ending with \"cinq\", \"neuf\", and \"dix\" change their endings to \"cinqu\", \"neuv\" , and \"diz\" before appending the suffix, respectively. \n", - "\n", - "We could start by proposing a WFST that replaces the suffix \"ième\" with \"e\" and then compose this onto the Cardinal WFST. If it is a legitimate cardinal, then there will be a path through CardinalFST and the integer will be rendered as normal. \n", - "\n", - "Meanwhile, the case of \"dix\", \"cinq\", and \"neuf\" would each require a distinct WFST as they are each a consequence of different rules of orthography and phonology. Like the case with \"e\", we could change each back to its root and then see if the CardinalWFST will permit a path with the new input. 
\n", - "\n", - "It is at this point that we can do a cost-benefit analysis and realize that all these cases can be managed by an explicit `string_map/string_file`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_9KTNQeIw4sq" - }, - "outputs": [], - "source": [ - "graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", - " (\"cinquième\",\t\"cinq\"),\n", - " (\"neuvième\",\t\"neuf\"),\n", - " (\"onzième\",\t\"onze\"),\n", - " (\"douzième\",\t\"douze\"),\n", - " (\"treizième\",\t\"treize\"),\n", - " (\"quatorzième\",\t\"quatorze\"),\n", - " (\"quinzième\",\t\"quinze\"),\n", - " (\"seizième\",\t\"seize\"),\n", - " (\"trentième\",\t\"trente\"),\n", - " (\"quarantième\",\t\"quarante\"),\n", - " (\"cinquantième\",\t\"cinquante\"),\n", - " (\"soixantième\",\t\"soixante\"),\n", - " (\"millième\",\t\"mille\"),\n", - "])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eo2_keFVqaY4" - }, - "source": [ - "We could then concatenate these with a WFST that accepts all tokens with these endings and then change the endings as desired. These will provide the cardinal roots just as effectively. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "O7I29ezmxylx" - }, - "source": [ - "The same can be said for \"premier/première\" and \"second/seconde\":" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3JZoz51VyGS6" - }, - "outputs": [], - "source": [ - "graph_firsts = pynini.string_map([(\"premier\", \"un\"),(\"première\", \"un\")])\n", - "graph_seconds = pynini.string_map([(\"second\", \"deux\"),(\"seconde\", \"deux\")])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NJ9BGGAwyTQ5" - }, - "source": [ - "*Note: We graph separately to manage their different superscripts later on.*\n", - "\n", - "Depending on your language of focus, the choice of implicitly reversing the root token or explicitly mapping back to root will be the most efficient, but it is worth considering both options if only to check your understanding of the language." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8PgVwDRRq9gr" - }, - "source": [ - "Putting our grammar together, we have:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ko2kAeKwrRSH" - }, - "outputs": [], - "source": [ - "strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", - "\n", - "graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", - " (\"cinquième\",\t\"cinq\"),\n", - " (\"neuvième\",\t\"neuf\"),\n", - " (\"onzième\",\t\"onze\"),\n", - " (\"douzième\",\t\"douze\"),\n", - " (\"treizième\",\t\"treize\"),\n", - " (\"quatorzième\",\t\"quatorze\"),\n", - " (\"quinzième\",\t\"quinze\"),\n", - " (\"seizième\",\t\"seize\"),\n", - " (\"trentième\",\t\"trente\"),\n", - " (\"quarantième\",\t\"quarante\"),\n", - " (\"cinquantième\",\t\"cinquante\"),\n", - " (\"soixantième\",\t\"soixante\"),\n", - " (\"millième\",\t\"mille\"),\n", - "])\n", - "\n", - "# Component will accept all tokens that end with desired strings\n", - "graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change) \n", - "\n", - "graph_firsts = pynini.string_map([(\"premier\", \"un\"),(\"première\", \"un\")])\n", - "graph_seconds = pynini.string_map([(\"second\", \"deux\"),(\"seconde\", \"deux\")])\n", - "\n", - "graph_get_cardinal = pynini.union(graph_firsts, graph_seconds, graph_get_cardinal) \n", - "\n", - "graph_cardinal = CardinalFst().just_cardinals\n", - "\n", - "graph_ordinal = graph_get_cardinal @ graph_cardinal" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ESxY3LsCdE8q" - }, - "outputs": [], - "source": [ - "apply_fst(\"sixième\", graph_ordinal)\n", - "apply_fst(\"première\", graph_ordinal)\n", - "apply_fst(\"seconde\", graph_ordinal)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qo_g8UdoUFJB" - }, - "source": [ - "## Classifier" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kemhdKAjzEIa" - }, - "source": [ - "Now that we've found a way to pass the work of the Ordinal grammar back onto the Cardinal grammar, we can move onto the Classifier. Like before, we need to inherit from `GraphFst` to properly insert token formatting and required attributes. As well, we will again use the `integer` property to tag our digit string.\n", - "\n", - "Indeed, the only major difference between the Ordinal Classifier and the Cardinal Classifier is the replacement of optional `negative` attribute with the `morphosyntactic_feature` attribute to indicate the superscript function." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EHM4Y3TW2nXT" - }, - "source": [ - "Since we are relying on the `CardinalFst` class in our grammar, we want to consider how to instantiate an instance of it. Since our ultimate goal is to build a Classifier that unites all semiotic classes, it makes sense to simply use the `CardinalFst` that we will need to call for our ITN and pass it as an argument to our new class." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 273 - }, - "id": "KsmPhWSa3LF_", - "outputId": "9e881ca9-a926-4249-dda8-9c52175569b5" - }, - "outputs": [], - "source": [ - "def __init__(self, cardinal: GraphFst):\n", - " super().__init__(name=\"ordinal\", kind=\"classify\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CtBQ-udB3S5Q" - }, - "source": [ - "To clear up the namespace, we will now be importing from the NeMo implementation of `CardinalFst` for French." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "L-JAcidf4QQg" - }, - "outputs": [], - "source": [ - "from nemo_text_processing.inverse_text_normalization.fr.taggers.cardinal import CardinalFst\n", - "\n", - "class OrdinalFst(GraphFst):\n", - " def __init__(self, cardinal: GraphFst):\n", - " super().__init__(name=\"ordinal\", kind=\"classify\")\n", - " graph_cardinal = cardinal.graph_no_exception # NeMo equivalent to self.just_cardinals" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FQfkAqZavCAB" - }, - "source": [ - "We now add in our grammar:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uUQ4BLuivGut" - }, - "outputs": [], - "source": [ - "class OrdinalFst(GraphFst):\n", - " def __init__(self, cardinal: GraphFst):\n", - " super().__init__(name=\"ordinal\", kind=\"classify\")\n", - " graph_cardinal = cardinal.graph_no_exception # may replace\n", - "\n", - " strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", - "\n", - " graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", - " (\"cinquième\",\t\"cinq\"),\n", - " (\"neuvième\",\t\"neuf\"),\n", - " (\"onzième\",\t\"onze\"),\n", - " (\"douzième\",\t\"douze\"),\n", - " (\"treizième\",\t\"treize\"),\n", - " (\"quatorzième\",\t\"quatorze\"),\n", - " (\"quinzième\",\t\"quinze\"),\n", - " (\"seizième\",\t\"seize\"),\n", - " (\"trentième\",\t\"trente\"),\n", - " (\"quarantième\",\t\"quarante\"),\n", - " (\"cinquantième\",\t\"cinquante\"),\n", - " (\"soixantième\",\t\"soixante\"),\n", - " (\"millième\",\t\"mille\"),\n", - " ])\n", - " \n", - " # Component will accept all tokens that end with desired strings\n", - " graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change) \n", - "\n", - " graph_firsts = pynini.string_map([(\"premier\", \"un\"),(\"première\", \"un\")])\n", - " graph_seconds = pynini.string_map([(\"second\", \"deux\"),(\"seconde\", \"deux\")])\n", - "\n", - " graph_get_cardinal = pynini.union(graph_firsts, graph_seconds, graph_get_cardinal) \n", - "\n", - " graph_ordinal = graph_get_cardinal @ graph_cardinal\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F_6EXPRMvnp2" - }, - "source": [ - "Now we come to the `morphosyntactic_features` property - a linguistic term for aspects of a word related to grammar. If intending to deploy your WFST through Sparrowhawk, this is the only ordinal property that is permitted (outside of the universal properties like `preserve_order`) and thus must carry all information regarding how to properly normalize the ordinal. (If Sparrowhawk deployment is not necessary, you may add additional properties to the tag.)\n", - "\n", - "How should we convey this information? Since the Verbalizer will be the main interface for our tags, it really does not matter - so long as we can reliably process the features. 
For the purposes of French, we just need `morphosyntactic_features` to decide the following:\n", - "- Insert the specific superscripts for \"premier/première\" or \"second/seconde\"\n", - "- Insert \"ᵉ\" otherwise\n", - "\n", - "We will also introduce another aspect of French Ordinals: they can be either plural or singular, identified by the suffix \"s\" on input and superscript \"ˢ\" on output. As such, our `morphosyntactic_features` should also decide the additional property:\n", - "- Insert the plural superscript " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "atctz6p-2GtV" - }, - "source": [ - "Since the default superscript is nearly universal, we will just specify this in our WFST and focus on the first and second ordinals as special cases. We will create a `graph_morpheme` component that inserts the default superscript - indicated with a standard \"e\" to avoid possible encoding issues. We will then append a WFST that will graph any possible plural marker - \"s\" - as part of the `morphosyntactic_features`. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ui99osyP2UuQ" - }, - "outputs": [], - "source": [ - "graph_morpheme = pynutil.insert(\"e\") # Insert e superscript\n", - "graph_plural = pynini.closure(pynini.accep(\"s\"), 0, 1) # We create an acceptor since we must process the possible \"s\"\n", - "\n", - "graph_morpheme_component = graph_morpheme + graph_plural\n", - "\n", - "graph_morphosyntactic_features = (pynutil.insert(\" morphosyntactic_features: \\\"\") \n", - " + graph_morpheme_component\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QAlqubA25gq0" - }, - "source": [ - "Introducing the `integer` feature:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rs2TyIBc5la6" - }, - "outputs": [], - "source": [ - "graph_reg_ordinals = graph_get_cardinal @ graph_cardinal # Maps regular ordinals to their cardinals; the first and second ordinals are handled separately below.\n", - "\n", - "graph_ordinal = pynutil.insert(\"integer: \\\"\") + graph_reg_ordinals + pynutil.insert(\"\\\"\")\n", - "graph_ordinal += graph_morphosyntactic_features" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xoqk20Pi2gT8" - }, - "source": [ - "For the first and second ordinals, we can explicitly state their mappings, as these occurrences are invariable. (First and second ordinals do not need to accommodate being the endings of other terms.) As such, we can just have mappings from the token to the superscripts."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "54aqdH_P63Ea" - }, - "outputs": [], - "source": [ - "firsts = pynini.string_map([(\"premier\", \"er\"), (\"première\",\"re\")])\n", - "firsts += graph_plural # Still accepts plural marker in superscript\n", - "seconds = pynini.string_map([(\"second\", \"d\"),(\"seconde\", \"de\")])\n", - "seconds += graph_plural \n", - "\n", - "graph_firsts = pynutil.insert(\"integer: \\\"1\\\" morphosyntactic_features: \\\"\") + firsts\n", - "graph_seconds = pynutil.insert(\"integer: \\\"2\\\" morphosyntactic_features: \\\"\") + seconds" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D2vQ4m7o7p84" - }, - "source": [ - "Placing them in our class:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "w_JKT8JMf-Mz" - }, - "outputs": [], - "source": [ - "class OrdinalFst(GraphFst):\n", - " def __init__(self, cardinal: GraphFst):\n", - " super().__init__(name=\"ordinal\", kind=\"classify\")\n", - " graph_cardinal = cardinal.graph_no_exception # may replace\n", - "\n", - " strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", - "\n", - " graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", - " (\"cinquième\",\t\"cinq\"),\n", - " (\"neuvième\",\t\"neuf\"),\n", - " (\"onzième\",\t\"onze\"),\n", - " (\"douzième\",\t\"douze\"),\n", - " (\"treizième\",\t\"treize\"),\n", - " (\"quatorzième\",\t\"quatorze\"),\n", - " (\"quinzième\",\t\"quinze\"),\n", - " (\"seizième\",\t\"seize\"),\n", - " (\"trentième\",\t\"trente\"),\n", - " (\"quarantième\",\t\"quarante\"),\n", - " (\"cinquantième\",\t\"cinquante\"),\n", - " (\"soixantième\",\t\"soixante\"),\n", - " (\"millième\",\t\"mille\"),\n", - " ])\n", - " \n", - " # Component will accept all tokens that end with desired strings\n", - " graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change) \n", - "\n", - " # Graph will map ordinals beyond second ordinal to their cardinals\n", - " graph_reg_ordinals = graph_get_cardinal @ graph_cardinal\n", - "\n", - " # Graphing morphosyntactic_features\n", - " graph_morpheme = pynutil.insert(\"e\") # Insert e superscript\n", - " graph_plural = pynini.accep(\"s\").ques # ques is equivalent to pynini.closure(, 0, 1)\n", - "\n", - " graph_morpheme_component = graph_morpheme + graph_plural\n", - "\n", - " graph_morphosyntactic_features = (pynutil.insert(\" morphosyntactic_features: \\\"\") \n", - " + graph_morpheme_component\n", - " )\n", - "\n", - " # Adding in the `integer` property:\n", - " graph_ordinal = pynutil.insert(\"integer: \\\"\") + graph_reg_ordinals + pynutil.insert(\"\\\"\")\n", - " graph_ordinal += graph_morphosyntactic_features \n", - "\n", - " # Case of first and second ordinals\n", - " firsts = pynini.string_map([(\"premier\", \"er\"), (\"première\",\"re\")])\n", - " firsts += graph_plural # Still accepts plural marker in superscript\n", - " seconds = pynini.string_map([(\"second\", \"d\"),(\"seconde\", \"de\")])\n", - " seconds += graph_plural \n", - "\n", - " graph_firsts = pynutil.insert(\"integer: \\\"1\\\" morphosyntactic_features: \\\"\") + firsts\n", - " graph_seconds = pynutil.insert(\"integer: \\\"2\\\" morphosyntactic_features: \\\"\") + seconds\n", - "\n", - " # All together\n", - " graph_ordinal = pynini.union(graph_ordinal, graph_firsts, graph_seconds)\n", - " self.fst = graph_ordinal.optimize()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CpGHVg6chmA0" - }, - "source": [ - "Trying out on some examples:" - ] - 
}, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "b5DL3PZRhpc8" - }, - "outputs": [], - "source": [ - "cardinal = CardinalFst()\n", - "ordinal = OrdinalFst(cardinal).fst\n", - "\n", - "apply_fst(\"premier\", ordinal)\n", - "apply_fst(\"premiers\", ordinal)\n", - "apply_fst(\"seconde\", ordinal)\n", - "apply_fst(\"douzièmes\", ordinal)\n", - "apply_fst(\"cent-cinquièmes\", ordinal)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MNQVgiv-UK29" - }, - "source": [ - "### Special Tokens" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UdiNAHGh71O9" - }, - "source": [ - "If you are particularly astute, you may have noticed that we have not closed the quotations around the `morphosyntactic_features` throughout, despite doing so for `integer`. This is not a typo, as there is one more aspect of the Classifier that must be addressed: special cases.\n", - "\n", - "For your language, you may notice that there are occasional exceptions to writing rules that are signaled by a specific vocabulary token in a string. As this must be communicated to our Verbalizer, it is important that we signal this vocabulary through our Classifier. \n", - "\n", - "For French, this can occur in the normalization of centuries. When using Ordinals to indicate centuries, French commonly writes with Roman numerals. For example:\n", - "- \"Fifth century\" -> \"cinquième siècle\" -> `Vᵉ siècle` \n", - "- \"Twentieth century\" -> \"vingtième siècle\" -> `XXᵉ siècle` \n", - "\n", - "As such, we must allow our Classifier to pass on the information that \"siècle\" follows an ordinal to our Verbalizer, so it may normalize with Roman numerals. We accomplish this by appending a WFST that accepts special tokens that follow our Ordinals, adding them to our `morphosyntactic_features` attribute with a forward slash to delineate." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MsWnT4BfQKcC" - }, - "outputs": [], - "source": [ - "special_tokens = pynini.accep(\"siècle\")\n", - "\n", - "graph_special_tokens = delete_space + pynutil.insert(\"/\") + special_tokens # We need to delete the space between the ordinal and this token.\n", - "graph_special_tokens = pynini.closure(graph_special_tokens, 0, 1)\n", - "\n", - "graph_ordinal += graph_special_tokens + pynutil.insert(\"\\\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "698_n5SFQ_jP" - }, - "source": [ - "*Once again, it is advised to retain a tsv file in `data` to quickly append these keywords.*\n", - "\n", - "Having taken care of the special case, we may now call `add_tokens` and complete the graph (fully written out below)."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nZ1dkft0Riou" - }, - "outputs": [], - "source": [ - "class OrdinalFst(GraphFst):\n", - " def __init__(self, cardinal: GraphFst):\n", - " super().__init__(name=\"ordinal\", kind=\"classify\")\n", - " graph_cardinal = cardinal.graph_no_exception # may replace\n", - "\n", - " strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", - "\n", - " graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", - " (\"cinquième\",\t\"cinq\"),\n", - " (\"neuvième\",\t\"neuf\"),\n", - " (\"onzième\",\t\"onze\"),\n", - " (\"douzième\",\t\"douze\"),\n", - " (\"treizième\",\t\"treize\"),\n", - " (\"quatorzième\",\t\"quatorze\"),\n", - " (\"quinzième\",\t\"quinze\"),\n", - " (\"seizième\",\t\"seize\"),\n", - " (\"trentième\",\t\"trente\"),\n", - " (\"quarantième\",\t\"quarante\"),\n", - " (\"cinquantième\",\t\"cinquante\"),\n", - " (\"soixantième\",\t\"soixante\"),\n", - " (\"millième\",\t\"mille\"),\n", - " ])\n", - " \n", - " # Component will accept all tokens that end with desired strings\n", - " graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change) \n", - "\n", - " # Graph will map ordinals beyond second ordinal to their cardinals\n", - " graph_reg_ordinals = graph_get_cardinal @ graph_cardinal\n", - "\n", - " # Graphing morphosyntactic_features\n", - " graph_morpheme = pynutil.insert(\"e\") # Insert e superscript\n", - " graph_plural = pynini.accep(\"s\").ques # We create an acceptor since we must process the possible \"s\"\n", - "\n", - " graph_morpheme_component = graph_morpheme + graph_plural\n", - "\n", - " graph_morphosyntactic_features = (pynutil.insert(\" morphosyntactic_features: \\\"\") \n", - " + graph_morpheme_component\n", - " )\n", - "\n", - " # Adding in the `integer` property:\n", - " graph_ordinal = pynutil.insert(\"integer: \\\"\") + graph_reg_ordinals + pynutil.insert(\"\\\"\")\n", - " graph_ordinal += graph_morphosyntactic_features \n", - "\n", - " # Case of first and second ordinals\n", - " firsts = pynini.string_map([(\"premier\", \"er\"), (\"première\",\"re\")])\n", - " firsts += graph_plural # Still accepts plural marker in superscript\n", - " seconds = pynini.string_map([(\"second\", \"d\"),(\"seconde\", \"de\")])\n", - " seconds += graph_plural \n", - "\n", - " graph_firsts = pynutil.insert(\"integer: \\\"1\\\" morphosyntactic_features: \\\"\") + firsts\n", - " graph_seconds = pynutil.insert(\"integer: \\\"2\\\" morphosyntactic_features: \\\"\") + seconds\n", - "\n", - "\n", - " # Special tokens\n", - " special_tokens = pynini.accep(\"siècle\")\n", - "\n", - " graph_special_tokens = delete_space + pynutil.insert(\"/\") + special_tokens # We need to delete the space in between this token and the following one.\n", - " graph_special_tokens = pynini.closure(graph_special_tokens, 0, 1)\n", - "\n", - " graph_ordinal += graph_special_tokens + pynutil.insert(\"\\\"\")\n", - "\n", - " # Finishing\n", - " graph_ordinal = self.add_tokens(graph_ordinal)\n", - " self.fst = graph_ordinal.optimize()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7a4zBo-YS1QD" - }, - "source": [ - "## Verbalizer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zYbrcGyGS2rW" - }, - "source": [ - "The initial part of the Ordinal Verbalizer is similar to the Cardinal WFST: we simply need to build a Verbalizer that inherits from `GraphFST` and removes the `integer` property tag. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KUv99A_rYjb9" - }, - "outputs": [], - "source": [ - "class OrdinalFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"ordinal\", kind=\"verbalize\")\n", - " graph_integer = (\n", - " pynutil.delete(\"integer:\")\n", - " + delete_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_DIGIT, 1)\n", - " + pynutil.delete(\"\\\"\")\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zKCt_EapZXGW" - }, - "source": [ - "Now we need to manage the `morphosyntactic_features` component. The first steps seem simple enough: delete the property tag and replace the superscript indicators with the actual superscripts. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yoa_mXMLabrU" - }, - "outputs": [], - "source": [ - " # Create mappings for all superscripts\n", - " superscript = pynini.union(\n", - " pynini.cross(\"e\", \"ᵉ\"), # only delete first quote since there may be more features\n", - " pynini.cross(\"d\", \"ᵈ\"),\n", - " pynini.cross(\"r\", \"ʳ\"),\n", - " pynini.cross(\"s\", \"ˢ\"),\n", - " )\n", - "\n", - " # Append to deletion of feature property. Note that we use plus closure for multiple superscripts.\n", - " graph_morphosyntactic_features = pynutil.delete(\" morphosyntactic_features: \\\"\") + superscript.plus" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xOA7_MsUrSJS" - }, - "source": [ - "### Romanization" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "K_SaG0DUa2t7" - }, - "source": [ - "Now we come to the possible Romanization component. Since we need to graph the superscript components as following the number, we want to design our graph so that `morphosyntactic_features` is the last component of the graph. However, we do not know that we need Romanization until we see the `morphosyntactic_features` component. As such, we need to design our graph such that two options are available initially for an input, but only one allows full traversal." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7dalc-tablG-" - }, - "source": [ - "![romanization.png](images/romanization.PNG)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mPTNCddNcEEE" - }, - "source": [ - "In cases where your WFST decisions are dependent on latter parts of an input string, permitting the union of two separate paths when only one is valid usually assists, as a standard pathing heuristic will only choose the valid path. \n", - "\n", - "In the case of French, this would require us to separate our Verbalizer into two parts: one for Arabic numerals and one for Roman numerals. For the Arabic WFST, we simply conclude the graph. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0YSy1PYOcuyD" - }, - "outputs": [], - "source": [ - "graph_integer = (\n", - " pynutil.delete(\"integer:\")\n", - " + delete_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_DIGIT, 1)\n", - " + pynutil.delete(\"\\\"\")\n", - " )\n", - "graph_Arabic = graph_integer + graph_morphosyntactic_features + pynutil.delete(\"\\\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nnXjUU5Pf7Sh" - }, - "source": [ - "For the Roman graph, things get a bit trickier. Ideally, we would want to build a WFST that maps each digit of `graph_Arabic` to a Roman equivalent. 
However, consider the following examples:\n", - "- 1 -> I\n", - "- 10 -> X\n", - "- 11 -> XI\n", - "- 100 -> C\n", - "- 101 -> CI\n", - "- 110 -> CX\n", - "- 111 -> CXI\n", - "\n", - "Since Roman numerals do not preserve powers of ten through digit placement, we will need to design separate FSTs for each digit position and apply them accordingly. As this can quickly become intensive, we will only work to enumerate the Ordinals from 1 to 100. (Note: We are doing this to accommodate centuries; there is little likelihood that any century beyond the 99th will be used in regular strings.)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3-fQHMc2iQrz" - }, - "source": [ - "First we design our graphs for converting from Arabic to Roman numerals:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "d6PDySykiXTh" - }, - "outputs": [], - "source": [ - "digits = pynini.string_map([(\"1\", \"I\"),\n", - " (\"2\",\t\"II\"),\n", - " (\"3\",\t\"III\"),\n", - " (\"4\",\t\"IV\"),\n", - " (\"5\",\t\"V\"),\n", - " (\"6\",\t\"VI\"),\n", - " (\"7\",\t\"VII\"),\n", - " (\"8\",\t\"VIII\"),\n", - " (\"9\",\t\"IX\"),\n", - " ])\n", - "tens = pynini.string_map([(\"1\", \"X\"),\n", - " (\"2\",\t\"XX\"),\n", - " (\"3\",\t\"XXX\"),\n", - " (\"4\",\t\"XL\"),\n", - " (\"5\",\t\"L\"),\n", - " (\"6\",\t\"LX\"),\n", - " (\"7\",\t\"LXX\"),\n", - " (\"8\",\t\"LXXX\"),\n", - " (\"9\",\t\"XC\"),\n", - " ])\n", - "zero = pynutil.delete(\"0\") # No Roman representation for zero." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wb-LmwJdk59m" - }, - "source": [ - "Now we build two separate filters: one will accept only single digit Arabic numerals and the other will accept two digit Arabic numerals. For this we can use `NEMO_DIGIT`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DW3oD7Hbli2X" - }, - "outputs": [], - "source": [ - "map_one_digit = NEMO_DIGIT\n", - "map_two_digits = NEMO_DIGIT ** 2 # pynini overloads the exponent function to allow self-concatenation." 
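Before composing these pieces together, a quick check can illustrate why position matters and why `digits` and `tens` are kept as separate maps. This is only a sketch and assumes the `apply_fst` helper and the `string_map` objects defined in the cells above:

```python
# The same Arabic digit maps to a different Roman numeral depending on its
# position, which is why `digits` and `tens` are separate transducers.
# (A sketch; relies on apply_fst and the maps defined above.)
apply_fst("4", digits)  # expected output: IV
apply_fst("4", tens)    # expected output: XL
```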
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xtYKLy9AmJZS" - }, - "source": [ - "We now build mappings between two digit Arabic numerals and Roman numerals, composing them onto the filters:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dUy7uEUXmT_g" - }, - "outputs": [], - "source": [ - "graph_one_digit_romans = NEMO_DIGIT @ digits\n", - "\n", - "graph_two_digit_romans = tens + (digits | zero)\n", - "graph_two_digit_romans = map_two_digits @ graph_two_digit_romans\n", - "\n", - "graph_romans = graph_one_digit_romans | graph_two_digit_romans" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JEinyAMdm7RJ" - }, - "source": [ - "We now take care of the occurrence of \"siècle\" before composing onto `graph_integer`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ERO19BbynPNX" - }, - "outputs": [], - "source": [ - "graph_romans = (graph_integer @ graph_romans) + graph_morphosyntactic_features\n", - "graph_romans += pynini.cross(\"/\", \" \") + \"siècle\" + pynutil.delete(\"\\\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zN-fwrCGoToQ" - }, - "source": [ - "We finalize with a union and calling `delete_tokens`, the complete Verbalizer now being::" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kr2wcToAofWB" - }, - "outputs": [], - "source": [ - "class OrdinalFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"ordinal\", kind=\"verbalize\")\n", - "\n", - " # Maps integer and removes attribute\n", - " graph_integer = (\n", - " pynutil.delete(\"integer:\")\n", - " + delete_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_DIGIT, 1)\n", - " + pynutil.delete(\"\\\"\")\n", - " )\n", - "\n", - " # Create mappings for all superscripts\n", - " superscript = pynini.union(\n", - " pynini.cross(\"e\", \"ᵉ\"), # only delete first quote since there may be more features\n", - " pynini.cross(\"d\", \"ᵈ\"),\n", - " pynini.cross(\"r\", \"ʳ\"),\n", - " pynini.cross(\"s\", \"ˢ\"),\n", - " )\n", - "\n", - " # Append to deletion of feature property. 
Note that we use plus closure for multiple superscripts.\n", - " graph_morphosyntactic_features = pynutil.delete(\" morphosyntactic_features: \\\"\") + superscript.plus\n", - "\n", - " # Writing WFST for Arabic\n", - " graph_Arabic = graph_integer + graph_morphosyntactic_features + pynutil.delete(\"\\\"\")\n", - "\n", - " # Mapping Roman numerals\n", - " digits = pynini.string_map([(\"1\", \"I\"),\n", - " (\"2\",\t\"II\"),\n", - " (\"3\",\t\"III\"),\n", - " (\"4\",\t\"IV\"),\n", - " (\"5\",\t\"V\"),\n", - " (\"6\",\t\"VI\"),\n", - " (\"7\",\t\"VII\"),\n", - " (\"8\",\t\"VIII\"),\n", - " (\"9\",\t\"IX\"),\n", - " ])\n", - " tens = pynini.string_map([(\"1\", \"X\"),\n", - " (\"2\",\t\"XX\"),\n", - " (\"3\",\t\"XXX\"),\n", - " (\"4\",\t\"XL\"),\n", - " (\"5\",\t\"L\"),\n", - " (\"6\",\t\"LX\"),\n", - " (\"7\",\t\"LXX\"),\n", - " (\"8\",\t\"LXXX\"),\n", - " (\"9\",\t\"XC\"),\n", - " ])\n", - " zero = pynutil.delete(\"0\") # No Roman representation for zero.\n", - "\n", - " # filters for Roman digits\n", - " map_one_digit = NEMO_DIGIT\n", - " map_two_digits = NEMO_DIGIT ** 2 # pynini overloads the exponent function to allow self-concatenation.\n", - "\n", - " # Composing onto roman digits\n", - " graph_one_digit_romans = NEMO_DIGIT @ digits\n", - "\n", - " graph_two_digit_romans = tens + (digits | zero)\n", - " graph_two_digit_romans = map_two_digits @ graph_two_digit_romans\n", - "\n", - " graph_romans = graph_one_digit_romans | graph_two_digit_romans\n", - "\n", - " # Writing WFST for Roman\n", - " graph_romans = (graph_integer @ graph_romans) + graph_morphosyntactic_features\n", - " graph_romans += pynini.cross(\"/\", \" \") + \"siècle\" + pynutil.delete(\"\\\"\")\n", - "\n", - " # Final composition\n", - " graph = (graph_romans | graph_Arabic)\n", - "\n", - " delete_tokens = self.delete_tokens(graph)\n", - " self.fst = delete_tokens.optimize()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Trying out our examples:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "example_regular = 'ordinal { integer: \"12\" morphosyntactic_features: \"es\" }'\n", - "example_roman = 'ordinal { integer: \"12\" morphosyntactic_features: \"es/siècle\" }'\n", - "\n", - "fst = OrdinalFst().fst\n", - "\n", - "apply_fst(example_regular, fst)\n", - "apply_fst(example_roman, fst)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yBgLhTq9pWZe" - }, - "source": [ - "We have now completed an Ordinal WFST from the ground up, allowing a separate numbering system for special cases." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-W1-BMVJUXXk" - }, - "source": [ - "## Final notes" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kR7E64P4pPU_" - }, - "source": [ - "Before moving on, there are some key takeaways that you may find useful for most (if not all) languages:\n", - "- Many ordinal systems rely on alteration of Cardinals. Even in the example of Korean, it is using a pre-existing counting system and adding a suffix to indicate ordering. 
As such, your Ordinal WFST will likely follow this tutorial's structure of changing the Ordinal to its original root and then relying on your Cardinal WFST for the majority of processing.\n", - "- The `morphosyntactic_features` property will carry the vast majority of information necessary for normalization through your Verbalizer.\n", - "- While not all writing systems have the same quirk as using Roman numerals in reference to centuries, you will likely find cases in your language when a specific token indicates unique rules for a semiotic class. Carrying this information to the Verbalizer is usually the simplest means of preserving the token while also facilitating normalization. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Rx8-LuJOUaa5" - }, - "source": [ - "# Decimal WFST " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D2MRXYxz8TGA" - }, - "source": [ - "\n", - "If the Cardinal WFST is the most crucial element of a normalization grammar, the construction of the Decimal WFST is a close second. Much like in the case of constructing Ordinals from Cardinal grammars, many aspects of the Decimal WFST will be reused throughout your other semiotic classes.\n", - "\n", - "To get started, you should study the numerical conventions in your language. In particular, you should take note of the following:\n", - "- How is the decimal component of a number pronounced in your language of focus. (e.g. The English number `1.33` can be verbalized as \"one point three three\" or \"one and thirty three hundredths.\")\n", - "- What is the punctuation mark used for decimal demarcation? (In North America, several writing systems use `.` while European nations will use `,`.)\n", - "- Are there general rules regarding pronunciation/formatting of numbers past the decimal demarcation? (e.g. Does your language pronounce each digit or pronounce as a series of three digit numbers?)\n", - "\n", - "Such questions will likely require some deep familiarity with the language, and it may benefit to ask a native speaker for some input. Of course, the level of depth is dependent on your needs, but researching these questions will help your normalization system appear more organic." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UsK78ib4N-gb" - }, - "source": [ - "## Grammar" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "p4CLOOA9OAwZ" - }, - "source": [ - "In the case of French, we have the following guidelines:\n", - "- French uses the comma ( `,` ) for decimal delineation. It is articulated as \"virgule\".\n", - "- Decimals can be read as a series of digits or grouped as Cardinal numbers arbitrarily. (e.g. \"`.333` can be \"virgule trois trois trois\" or \"virgule trois-cent-trente-trois\".) \n", - "\n", - "As such, our grammar needs to accommodate the following pattern: \n", - "\n", - "`cardinal + \"virgule\" + string_of_cardinals`\n", - "\n", - "Given our experience with our previous WFSTs, this seems simple enough. 
We assume we have an instance of CardinalFst available and create a subcomponent to map the integer portion of a decimal:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XSp9FTzhf0XZ" - }, - "outputs": [], - "source": [ - "cardinal = CardinalFst().graph_no_exception # NeMo equivalent of just_cardinals\n", - "\n", - "# place cardinal under closure so the integer part may be absent (values < 1)\n", - "graph_integer = pynini.closure(cardinal, 0, 1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bk3_3iawgAZE" - }, - "source": [ - "Concatenate it with a subcomponent that deletes the delineator \"virgule\":" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UMzfAKkngH6z" - }, - "outputs": [], - "source": [ - "delete_virgule = pynutil.delete(\"virgule\")\n", - "graph_decimal = graph_integer + delete_space + delete_virgule" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GXjbtbLYgn17" - }, - "source": [ - "And permit the occurrence of several strings of cardinals to follow:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "LMMNBJz8gtTA" - }, - "outputs": [], - "source": [ - "graph_string_of_cardinals = delete_space + cardinal\n", - "graph_string_of_cardinals = pynini.closure(graph_string_of_cardinals, 1)\n", - "\n", - "graph_decimal += graph_string_of_cardinals" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jTgnRLddhGdE" - }, - "source": [ - "Let us try an example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "D4rjDh0ShJAp" - }, - "outputs": [], - "source": [ - "example = \"trois virgule trois cinquante-cinq\" \n", - "apply_fst(example, graph_decimal) # Should output only the cardinals in the string" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RfD1d9JOioyl" - }, - "source": [ - "### Ambiguity?" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3IaI1mCIe_6i" - }, - "source": [ - "Note that our decision to include multiple strings of cardinals after the decimal marker has introduced some ambiguity into our WFST. Consider if a decimal number were followed by an integer series (e.g. `2.5, 5, 6`). Now what should be an application of one DecimalFst and two applications of a CardinalFst can be interpreted as a single DecimalFst application (e.g. `2.556`). What can be done?\n", - "\n", - "While we will address this in greater depth later (see [Tokenize and Classify](#tokenize-and-classify)), the short answer is that cases such as these must be calibrated according to use and linguistic intuition. As this is an inherent ambiguity in the language and its writing system, we can never truly remove this possibility without restricting our ability to model the language. However, we can rely on a few logical assumptions to guide our decision making:\n", - "- Unless the grammar is deployed in a restrictive setting (e.g. a financial environment or another setting where strings of numbers are often read in series), it is not likely for a valid string to exhibit this level of ambiguity. Speakers typically try to reduce possible ambiguity in their language production and would likely rephrase to avoid issues such as these. [See Grice's maxims](https://en.wikipedia.org/wiki/Cooperative_principle).\n", - "- While a language may allow a specific string by *rule*, speakers may typically avoid it *in practice* due to conventions or difficulty. 
In our case, while it may be possible to read `2,100 05` as \"deux virgule dix-mille-cinq\" (\"two point ten-thousand and five\"), it's dubious that a speaker would find such easier to read than \"deux virgule une zéro zéro zéro cinq\". (The place value of large strings tend to take longer to recognize.)\n", - "\n", - "While hardly satisfying, these two points will allow us to dismiss *some* worry. With the former observation being outside our grammar's ability to manage, we accommodate the latter point by using an alternate WFST from our CardinalFST: `numbers_up_to_million`. (To utilize in your own language, create a WFST in the Cardinal class right before building up to `graph_millions`. Again, calling `optimize` is advised.)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "piNe1AWspa4J" - }, - "outputs": [], - "source": [ - "cardinal = CardinalFst().numbers_up_to_million\n", - "\n", - "# place cardinal under closure to permit values <=1\n", - "graph_integer = pynini.closure(cardinal, 0, 1)\n", - "\n", - "delete_virgule = pynutil.delete(\"virgule\")\n", - "graph_decimal = graph_integer + delete_space + delete_virgule\n", - "\n", - "graph_string_of_cardinals = delete_space + cardinal\n", - "graph_string_of_cardinals = pynini.closure(graph_string_of_cardinals, 1)\n", - "\n", - "graph_decimal += graph_string_of_cardinals" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "B1gglt0tfM5V" - }, - "source": [ - "## Classifier" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fVkOWkncgOZc" - }, - "source": [ - "Like with our previous WFSTs, the main duty for the classifier is inserting the necessary properties for the semiotic token. For the `decimal` tag, the following properties are used:\n", - "- `integer_part` - indicates value before decimal marker\n", - "- `fractional_part` - indicates values after the decimal marker\n", - "- `negative` - indicates if value is positive or negative (Optional)\n", - "- `quantity` - designates if decimal is in regards to a specific quantity. (See Quantities.)\n", - "\n", - "We can begin by inserting the `integer_part` around our `cardinal` subcomponent and the `fractional_part` around our `graph_string_of_cardinals`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_zw_cDszh-fB" - }, - "outputs": [], - "source": [ - "graph_integer = pynutil.insert(\"integer_part: \\\"\") + cardinal + pynutil.insert(\"\\\" \")\n", - "graph_fractional = pynutil.insert(\"fractional_part: \\\"\") + graph_string_of_cardinals + pynutil.insert(\"\\\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bxlnn_7tiQMn" - }, - "source": [ - "We then concatenate them together with a component that recognizes and removes the decimal separator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BxNS9_AwiWHf" - }, - "outputs": [], - "source": [ - "graph_integer_or_none = graph_integer | pynutil.insert(\"integer_part: \\\"0\\\" \", weight=.1) # In cases we don't always have an integer preceding\n", - "graph_decimal_no_sign = graph_integer_or_none + delete_space + pynutil.delete(\"virgule\") + graph_fractional" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "b7uGfsi4i5UI" - }, - "source": [ - "*Note that we allow insertion of 0 if there is no integer to accommodate reading of only decimal values*\n", - "\n", - "Now we allow the possibility of negative values. 
(Recall French uses \"moins\" to indicate the negative.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VsP79naojQZR" - }, - "outputs": [], - "source": [ - "graph_negative = pynini.cross(\"moins\", \"negative: \\\"-\\\" \") + delete_space\n", - "graph_decimal = graph_negative + graph_decimal_no_sign" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QTcvq5HqllqW" - }, - "outputs": [], - "source": [ - "example = \"moins deux virgule cent-quatre\"\n", - "apply_fst(example, graph_decimal)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FVKuGj_9mZ75" - }, - "source": [ - "Placing within a `DecimalFst` class, we have:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tXwr32ermesp" - }, - "outputs": [], - "source": [ - "class DecimalFst(GraphFst):\n", - " def __init__(self, cardinal: GraphFst):\n", - " super().__init__(name=\"decimal\", kind=\"classify\")\n", - " cardinal = cardinal.numbers_up_to_million\n", - " delete_virgule = pynutil.delete(\"virgule\")\n", - "\n", - " graph_integer = pynutil.insert(\"integer_part: \\\"\") + cardinal + pynutil.insert(\"\\\" \") + delete_space\n", - " graph_integer_or_none = graph_integer | pynutil.insert(\"integer_part: \\\"0\\\" \", weight=.001) # In cases we don't always have an integer preceding\n", - "\n", - " graph_string_of_cardinals = delete_space + cardinal\n", - " graph_string_of_cardinals = pynini.closure(graph_string_of_cardinals, 1)\n", - " graph_fractional = pynutil.insert(\"fractional_part: \\\"\") + graph_string_of_cardinals + pynutil.insert(\"\\\"\")\n", - "\n", - " graph_decimal_no_sign = graph_integer_or_none + pynutil.delete(\"virgule\") + graph_fractional \n", - "\n", - " graph_negative = pynini.cross(\"moins\", \"negative: \\\"-\\\" \") + delete_space\n", - " graph_negative = pynini.closure(graph_negative, 0, 1)\n", - "\n", - " graph_decimal = graph_negative + graph_decimal_no_sign\n", - "\n", - " graph = self.add_tokens(graph_decimal)\n", - " self.fst = graph.optimize()\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gjxI5mEKfHLo" - }, - "source": [ - "### Quantities" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3WuwWPf3py7G" - }, - "source": [ - "Recalling our earlier remarks regarding convention in language use, you may find a need to adjust the DecimalFst when processing specific values. For instance, consider the following equivalencies from English:\n", - "- `1,500,000` = \"one million five hundred thousand\" = \"one point five million\" = `1.5 million`\n", - "- `2,750,000` = \"two million seven hundred and fifty thousand\" = \"two point seven five million\" = `2.75 million`\n", - "\n", - "For large numbers, there is a tendency to use the decimal system as though one is describing a quantity. Notably, there is a minimum value for which this is comfortable. (A speaker of English may say \"three point five trillion\" but \"three point five hundred\" comes off as odd.)\n", - "\n", - "This behavior can occur in other languages. For example, the amount of `$1,500,000` may be read in French as \"une virgule cinq million de dollars\" (\"one point five million dollars\"). " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RgMBIKlYdsGz" - }, - "source": [ - "Our Classifier can be made to accommodate this behavior: we simply need to repeat what we did for `OrdinalFst` and set aside several key terms to trigger our model. 
For French, we will choose all terms added for values greater than a million. (Chosen empirically.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vEcsUXw5fUEe" - }, - "outputs": [], - "source": [ - "suffix = pynini.union(\n", - " \"million\",\n", - " \"millions\",\n", - " \"milliard\",\n", - " \"milliards\",\n", - " \"billion\",\n", - " \"billions\",\n", - " \"billiard\",\n", - " \"billiards\",\n", - " \"trillion\",\n", - " \"trillions\",\n", - " \"trilliard\",\n", - " \"trilliards\",\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wIIUAsR-fgQA" - }, - "source": [ - "We will then need to use a WFST to graph any numbers the precede these amounts. Note, unlike for our `DecimalFst`, we need to permit cardinals as well as decimals. This is because we want to be able to normalize a phrase like \"three million\" to `3 million` as this will be less obtrusive than `3,000,000`.\n", - "\n", - "As such, we will call a `CardinalFst` and a `DecimalFst` in for `graph_quantities`. Since these are both utilized for our `DecimalFst`, it would be more efficient to just pass them along as function/class variables." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yern-idtycWg" - }, - "outputs": [], - "source": [ - "def get_quantity(decimal, cardinal_up_to_thousand):\n", - " key_values = pynini.union(\n", - " \"million\",\n", - " \"millions\",\n", - " \"milliard\",\n", - " \"milliards\",\n", - " \"billion\",\n", - " \"billions\",\n", - " \"billiard\",\n", - " \"billiards\",\n", - " \"trillion\",\n", - " \"trillions\",\n", - " \"trilliard\",\n", - " \"trilliards\",\n", - " )\n", - " # The French WFST that this borrows from has not removed leading zeroes yet.\n", - " numbers = cardinal_up_to_thousand @ (\n", - " pynutil.delete(pynini.closure(\"0\")) + pynini.difference(NEMO_DIGIT, \"0\") + pynini.closure(NEMO_DIGIT)\n", - " )\n", - " res = (\n", - " pynutil.insert(\"integer_part: \\\"\")\n", - " + numbers\n", - " + pynutil.insert(\"\\\"\")\n", - " + (\n", - " pynini.union(delete_hyphen, delete_extra_space)\n", - " ) # Can be written either as 'deux-millions' or 'deux millions' depending on whether it registers as a noun or part of cardinal.\n", - " + pynutil.insert(\" quantity: \\\"\")\n", - " + suffix\n", - " + pynutil.insert(\"\\\"\")\n", - " )\n", - " # Union with decimal to permit either a cardinal or decimal representation.\n", - " res |= decimal + delete_extra_space + pynutil.insert(\" quantity: \\\"\") + suffix + pynutil.insert(\"\\\"\")\n", - " return res" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uT4LMo8ADBAq" - }, - "source": [ - "We can now insert this into our Classifier, producing the following:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "d2KrCuyGDLwh" - }, - "outputs": [], - "source": [ - "class DecimalFst(GraphFst):\n", - " def __init__(self, cardinal: GraphFst):\n", - " super().__init__(name=\"decimal\", kind=\"classify\")\n", - " quantities_cardinal = cardinal.graph_hundreds_component_at_least_one_none_zero_digit\n", - " cardinal = cardinal.graph_no_exception\n", - " delete_virgule = pynutil.delete(\"virgule\")\n", - "\n", - " graph_integer = pynutil.insert(\"integer_part: \\\"\") + cardinal + pynutil.insert(\"\\\" \") + delete_space\n", - " graph_integer_or_none = graph_integer | pynutil.insert(\"integer_part: \\\"0\\\" \", weight=.001) # In cases we don't always have an integer preceding\n", - "\n", - " 
graph_string_of_cardinals = delete_space + cardinal\n", - " graph_string_of_cardinals = pynini.closure(graph_string_of_cardinals, 1)\n", - " graph_fractional = pynutil.insert(\"fractional_part: \\\"\") + graph_string_of_cardinals + pynutil.insert(\"\\\"\")\n", - "\n", - " graph_decimal_no_sign = graph_integer_or_none + delete_virgule + graph_fractional \n", - "\n", - " graph_negative = pynini.cross(\"moins\", \"negative: \\\"-\\\" \") + delete_space\n", - " graph_negative = pynini.closure(graph_negative, 0, 1)\n", - " graph_decimal = graph_negative + graph_decimal_no_sign\n", - "\n", - " # Union default decimal with version that accepts quantities\n", - " graph_decimal |= graph_negative + get_quantity(\n", - " graph_decimal_no_sign, quantities_cardinal\n", - " )\n", - " final_graph = self.add_tokens(graph_decimal)\n", - " self.fst = final_graph.optimize()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cD-eKqO6qTyh" - }, - "outputs": [], - "source": [ - "cardinal = CardinalFst()\n", - "decimal = DecimalFst(cardinal).fst\n", - "example = \"trois virgule cent-quatre billion\"\n", - "apply_fst(example, decimal)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HiSLKF3RfRZA" - }, - "source": [ - "## Verbalizer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QnkOV5FlteQA" - }, - "source": [ - "As before, the Verbalizer is responsible for removing the formatting and rendering a given token in conventional form. As the process remains similar to Ordinals and Cardinals (deleting strings in a regular matter) we will instead focus on a unique concern for `DecimalFst`: numeral spacing.\n", - "\n", - "For some writing systems, decimal numbers and other strings are typically not written as a single string, instead using punctuation to group numbers for clarity. For example, in the United States, integer digits greater than a thousand are separated by commas for every three digits:\n", - "- `12345.678` -> `12,345.678`\n", - "\n", - "A similar rule occurs in French, save it employs spaces on each side of the decimal marker:\n", - "- `12345,6789` -> `12 345,678 9`" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2h4WQZ1a4Cpc" - }, - "source": [ - "While simple enough, this rule poses a slight complication: it works from the left and right of the decimal separator, whereas WFSTs process linearly from the beginning (or end) of strings. As such we will need to break the formatting rule into two components: one for the integer component and one for the decimal component." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ViOFNdZw4-qu" - }, - "source": [ - "Starting with the integer component, we need our subcomponent to recognize every three digits and insert a space before. We can achieve this with some `graph_utils` helper objects - `NEMO_DIGIT` and `NEMO_NON_BREAKING_SPACE`, which accept all digits and non-breaking spaces, respectively. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z36be2Vo5VbR" - }, - "outputs": [], - "source": [ - "every_three_digits = NEMO_DIGIT ** 3 # accepts a string of three digits\n", - "space_every_three_integer = pynini.closure(NEMO_NON_BREAKING_SPACE + every_three_digits) # inserts space before every three digits." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RSB2gGH-5vwi" - }, - "source": [ - "However, we cannot let the component insert spaces when there are *only* three digits (e.g. `100`.) 
As such, we need to make sure the insertion only begins starting from the beginning of a string (e.g. when there is a string between one and three digits.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wfWp3ghH6mDQ" - }, - "outputs": [], - "source": [ - "space_every_three_integer = pynini.closure(NEMO_DIGIT, 1, 3) + space_every_three_integer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NJrQYSfA6vyu" - }, - "source": [ - "For the case of the decimal spacing, we simply reverse the logic:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vBP6ncTp6yXX" - }, - "outputs": [], - "source": [ - "space_every_three_decimal = pynini.closure(NEMO_NON_BREAKING_SPACE + every_three_digits)\n", - "space_every_three_decimal = space_every_three_decimal + pynini.closure(NEMO_DIGIT, 1, 3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WRXPN_gk69VV" - }, - "source": [ - "Placed into our Verbalizer, we would see the following:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "h49eztvs7BXH" - }, - "outputs": [], - "source": [ - "class DecimalFst(GraphFst):\n", - " \"\"\"\n", - " Finite state transducer for verbalizing decimal, e.g.\n", - " decimal { negative: \"true\" integer_part: \"12\" fractional_part: \"5006\" quantity: \"billion\" } -> -12.5006 billion\n", - " \"\"\"\n", - "\n", - " def __init__(self):\n", - " super().__init__(name=\"decimal\", kind=\"verbalize\")\n", - "\n", - " # Need parser to group digits by threes\n", - " exactly_three_digits = NEMO_DIGIT ** 3\n", - " at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3)\n", - "\n", - " space_every_three_integer = (\n", - " at_most_three_digits + (pynutil.insert(NEMO_NON_BREAKING_SPACE) + exactly_three_digits).closure()\n", - " )\n", - " space_every_three_decimal = (\n", - " pynini.accep(\",\")\n", - " + (exactly_three_digits + pynutil.insert(NEMO_NON_BREAKING_SPACE)).closure()\n", - " + at_most_three_digits\n", - " )\n", - " group_by_threes = space_every_three_integer | space_every_three_decimal\n", - " self.group_by_threes = group_by_threes\n", - "\n", - " optional_sign = pynini.closure(pynini.cross(\"negative: \\\"true\\\"\", \"-\") + delete_space, 0, 1)\n", - " integer = (\n", - " pynutil.delete(\"integer_part:\")\n", - " + delete_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_NOT_QUOTE, 1)\n", - " + pynutil.delete(\"\\\"\")\n", - " )\n", - " integer = integer @ group_by_threes\n", - " optional_integer = pynini.closure(integer + delete_space, 0, 1)\n", - " fractional = (\n", - " pynutil.insert(\",\")\n", - " + pynutil.delete(\"fractional_part:\")\n", - " + delete_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_NOT_QUOTE, 1)\n", - " + pynutil.delete(\"\\\"\")\n", - " )\n", - " fractional = fractional @ group_by_threes\n", - " optional_fractional = pynini.closure(fractional + delete_space, 0, 1)\n", - " quantity = (\n", - " pynutil.delete(\"quantity:\")\n", - " + delete_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_NOT_QUOTE, 1)\n", - " + pynutil.delete(\"\\\"\")\n", - " )\n", - " optional_quantity = pynini.closure(pynutil.insert(\" \") + quantity + delete_space, 0, 1)\n", - " graph = (optional_integer + optional_fractional + optional_quantity).optimize()\n", - " self.numbers = graph # Saving just the part of the graph used for numbers\n", - " graph = optional_sign + graph\n", - " delete_tokens = 
self.delete_tokens(graph)\n", - "        self.fst = delete_tokens.optimize()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Trying out some examples:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fst = DecimalFst().fst\n", - "\n", - "example1 = 'decimal { integer_part: \"3\" fractional_part: \"10453\" quantity: \"billion\" }'\n", - "example2 = 'decimal { integer_part: \"22323\" fractional_part: \"104553\" }'\n", - "\n", - "apply_fst(example1, fst)\n", - "apply_fst(example2, fst)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CZbshZCW8clI" - }, - "source": [ - "# Money WFST " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xuiv8HMz7yjm" - }, - "source": [ - "Now that we've handled some of the foundational classes, it's time to see how they build up to permit more concrete ones. Let's see how the previous WFSTs assist in building a WFST for normalizing currency: the `MoneyFst`. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wTU2c7MtUpqF" - }, - "source": [ - "## Grammar" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qqyRm8Ru8TDf" - }, - "source": [ - "While the exact phrasing will vary, a valid string for currency will possess the following qualities:\n", - "- A major and/or minor denomination of currency\n", - "- A numeric quantity of the denomination \n", - "\n", - "As our `CardinalFst` and `OrdinalFst` already allow us to normalize the quantity, the only issue for `MoneyFst` is to graph the amounts and build a vocabulary to recognize the denominations.\n", - "\n", - "For French, we will use the following examples to build upon:\n", - "- \"une euros\" -> `1 €`\n", - "- \"deux euros\" -> `2 €` \n", - "- \"deux euros cinq\" -> `2,05 €` \n", - "- \"cinq centimes\" -> `0,05 €`\n", - "- \"deux billions de euros\" -> `2 billions de euros`" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FMqUir9n9_cA" - }, - "source": [ - "These suggest the following requirements of our grammar:\n", - "- There must be a mapping between \"euro\" and \"centime\" and `€` in our vocabulary\n", - "- This mapping must allow both singular and plural forms\n", - "- The currency denomination is phrased between major and minor denominations (\"une euro cinq\" and not \"une cinq euro\")\n", - "- Large quantities of currency are left 'as is' instead of normalized\n", - "\n", - "We may deal with the vocabulary in the typical fashion:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XN9nbNhB-vEV" - }, - "outputs": [], - "source": [ - "major_currency = pynini.string_map([(\"euro\", \"€\")])\n", - "minor_currency = pynini.string_map([(\"centime\", \"€\")])\n", - "\n", - "graph_plural = pynutil.delete(\"s\").ques\n", - "\n", - "major_currency += graph_plural\n", - "minor_currency += graph_plural" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3aHrm1qPAc-f" - }, - "source": [ - "Moving to the numbers, note that we need to append a leading zero to the value of fractional currency amounts (\"five cents\" -> `$0.05`). 
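To see this padding on its own before wiring it in (a small standalone sketch: `NEMO_DIGIT`, `pynutil`, and the `apply_fst` helper come from earlier cells, and the name `pad_to_two_digits` is purely illustrative):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Left-pads a single digit with \"0\" and passes two-digit strings through untouched.\n", - "pad_to_two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert(\"0\") + NEMO_DIGIT)\n", - "\n", - "apply_fst(\"5\", pad_to_two_digits)   # -> 05\n", - "apply_fst(\"42\", pad_to_two_digits)  # -> 42" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "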
We bring back the subgraph from `CardinalFst` that maps tokens to numbers without tokenization to assist with this:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jwi-yQW1AjvG" - }, - "outputs": [], - "source": [ - "from nemo_text_processing.inverse_text_normalization.fr.taggers import cardinal\n", - "\n", - "cardinal_graph = cardinal.CardinalFst()\n", - "graph_cardinal = cardinal_graph.graph_no_exception # graphs cardinals w/o tokenization\n", - "\n", - "add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert(\"0\") + NEMO_DIGIT)\n", - "graph_fractional_values = graph_cardinal @ add_leading_zero_to_double_digit" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, let us consider how to manage large quantities of currency. In our example (\"deux billions de euros\" -> `2 billions de euros`) we see that its behavior mirrors that of the `get_quantity` portion of `DecimalFst`. As such, it would be useful if there were a subcomponent of that graph that we could reuse here. Like in the case of `CardinalFst`, let us go back and create a subgraph for later use. Since all our quantities are positive, this is best accomplished right before incorporating the `negative` property, creating a `self.final_graph_wo_negative`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class DecimalFst(GraphFst):\n", - "    def __init__(self, cardinal: GraphFst):\n", - "        super().__init__(name=\"decimal\", kind=\"classify\")\n", - "        quantities_cardinal = cardinal.graph_hundreds_component_at_least_one_none_zero_digit\n", - "        cardinal = cardinal.graph_no_exception\n", - "        delete_virgule = pynutil.delete(\"virgule\")\n", - "\n", - "        graph_integer = pynutil.insert(\"integer_part: \\\"\") + cardinal + pynutil.insert(\"\\\" \") + delete_space\n", - "        graph_integer_or_none = graph_integer | pynutil.insert(\"integer_part: \\\"0\\\" \", weight=.001) # In cases we don't always have an integer preceding\n", - "\n", - "        graph_string_of_cardinals = delete_space + cardinal\n", - "        graph_string_of_cardinals = pynini.closure(graph_string_of_cardinals, 1)\n", - "        graph_fractional = pynutil.insert(\"fractional_part: \\\"\") + graph_string_of_cardinals + pynutil.insert(\"\\\"\")\n", - "\n", - "        graph_decimal_no_sign = graph_integer_or_none + delete_virgule + graph_fractional \n", - "\n", - "        ### NEW GRAPH HERE\n", - "        self.final_graph_wo_negative = graph_decimal_no_sign | get_quantity(\n", - "            graph_decimal_no_sign, quantities_cardinal\n", - "        )\n", - "\n", - "        graph_negative = pynini.cross(\"moins\", \"negative: \\\"-\\\" \") + delete_space\n", - "        graph_negative = pynini.closure(graph_negative, 0, 1)\n", - "        graph_decimal = graph_negative + graph_decimal_no_sign\n", - "\n", - "        # Union default decimal with version that accepts quantities\n", - "        graph_decimal |= graph_negative + get_quantity(\n", - "            graph_decimal_no_sign, quantities_cardinal\n", - "        )\n", - "        final_graph = self.add_tokens(graph_decimal)\n", - "        self.fst = final_graph.optimize()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Allowing us to change our grammar to:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from nemo_text_processing.inverse_text_normalization.fr.taggers import cardinal, decimal\n", - "\n", - "cardinal_graph = cardinal.CardinalFst()\n", - "decimal_graph 
= decimal.DecimalFst(cardinal_graph)\n", - "\n", - "graph_cardinal = cardinal_graph.graph_no_exception # graphs cardinals w/o tokenization\n", - "graph_decimal = decimal_graph.final_graph_wo_negative # graphs positive decimals w/o tokenization\n", - "\n", - "add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert(\"0\") + NEMO_DIGIT)\n", - "graph_fractional_values = graph_cardinal @ add_leading_zero_to_double_digit" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L1RHoW-TLzIz" - }, - "source": [ - "Note that by doing this, we're also incorporating the formatting from the `decimal` class up to this point. Since these overlap with the `money` class (see next section), we have saved ourselves some work. \n", - "\n", - "Since we already made `get_quantity` part of our `DecimalFst`, we can avoid dealing with large quantities now. However, this does mean we still need a way to leave currencies 'as is' without normalization. We can do this by using the `project` method, which will create a WFST that accepts either all valid inputs or all valid outputs of another WFST (depending on the argument)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7l_TLtJkMluU" - }, - "outputs": [], - "source": [ - "major_currency_no_normalize = major_currency.project(\"input\")\n", - "apply_fst(\"euro\", major_currency_no_normalize)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "raBdHc_WXEpG" - }, - "source": [ - "We then prepend a WFST that recognizes prepositions commonly used before large values of currency (\"d'\", \"des\"):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "CEuxiVgDXRBf" - }, - "outputs": [], - "source": [ - "graph_preposition = pynini.union(\"des \", \"d'\") # Used for large amounts (billions de euros)\n", - "major_currency_no_normalize = pynini.closure(graph_preposition, 0, 1) + major_currency.project(\"input\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FlXmf8Fq_Rm1" - }, - "source": [ - "## Classifier" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T5BBuQRzLuXS" - }, - "source": [ - "For the Money semiotic class, we have available the following properties for tokenization:\n", - "- `integer_part`\n", - "- `fractional_part` \n", - "- `currency`\n", - "\n", - "Laying the initial groundwork seems simple enough. 
We first instantiate our `MoneyFst` classifier with our initial grammars:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EZaCeHcFWVP3" - }, - "outputs": [], - "source": [ - "class MoneyFst(GraphFst):\n", - " def __init__(self, cardinal: GraphFst, decimal: GraphFst):\n", - " super().__init__(name=\"money\", kind=\"classify\")\n", - " major_currency = pynini.string_map([(\"euro\", \"€\")])\n", - " minor_currency = pynini.string_map([(\"centime\", \"€\")])\n", - "\n", - " graph_plural = pynutil.delete(\"s\").ques\n", - "\n", - " major_currency += graph_plural\n", - " minor_currency += graph_plural\n", - "\n", - " major_currency_no_normalize = major_currency.project(\"input\")\n", - " graph_preposition = pynini.union(\"des \", \"d'\") # Used for large amounts (billions de euros)\n", - " major_currency_no_normalize = graph_preposition + major_currency.project(\"input\")\n", - "\n", - " graph_cardinal = cardinal.graph_no_exception\n", - " graph_decimal = decimal.final_graph_wo_negative\n", - "\n", - " add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert(\"0\") + NEMO_DIGIT)\n", - " graph_fractional_values = graph_cardinal @ add_leading_zero_to_double_digit" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_bpkXroLWaBo" - }, - "source": [ - "Let us now manage the `currency` property. We have the following scenarios to consider:\n", - "- Major denomination only\n", - "- Minor denomination only\n", - "- Major denomination and implicit minor denomination (\"cinq euro trois\")\n", - "- Major denomination and explicit minor denomination (\"cinq euros et trois centimes\")\n", - "- Large quantities of euros (\"cinq billion des euros\")\n", - "\n", - "Note how across cases the use of `graph_cardinal` and `graph_decimal` will be applied differently. Further, we may have varying orders in which tags are assigned proper values. For instance, if we have only minor denomination we would assign `fractional_part` before `currency`. Meanwhile, major denomination and implicit minor denomination would be the order of `integer_part`, `currency`, `fractional_part`. While we could try and figure out a way to preserve order, recall that the use of permutations in NeMo ITN makes that unnecessary: we can assume the desired order of tags reach our Verbalizer without make overt efforts in our Classifier! \n", - "\n", - "For example, let's say we need to process \"five dollars\" as `$5.00`. Processed linearly, we could get a token sequence along the lines of: `{ integer_part: \"5\" currency: \"$\" }`. If we passed this token array straight to a Verbalizer, we would need to configure a graph that effectively reverses the order so we could parse the `currency` field prior to the `integer_part` field, perhaps something along the lines of: \n", - "\n", - "`pynutil.insert(\"$\") + delete_space + pynutil.delete('integer_part: \\\"') +.... + pynutil.delete('currency: \"$\"')`\n", - "\n", - "But since NeMo creates permutations of our Classifier outputs, this is unnecessary. We can simply assume whatever would be the most convenient order for us (e.g. 
`{ currency: \"$\" integer_part: \"5\" }`) and build our Verbalizer around that:\n", - "\n", - "`pynutil.delete('currency: \\\"') + NEMO_SIGMA + pynutil.delete('\\\" integer_part: \\\"') + NEMO_DIGIT +...`\n", - "\n", - "Along with helping to keep our script simpler (we can focus simply on tokenization and not worry about what input order our Verbalizers will accept), this also allows us to overcome structural constraints of WFSTs, namely that they are [limited in reordering text strings](https://en.wikipedia.org/wiki/Pushdown_automaton)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fMZ13D2Dh9ZF" - }, - "source": [ - "Keeping this in mind, let's begin mapping the proper tags. Since they're relatively simple, we can start with only major and minor denominations:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EtwWLp7VbbjM" - }, - "outputs": [], - "source": [ - "graph_integer_component = pynutil.insert(\"integer_part: \\\"\") + graph_cardinal + pynutil.insert(\"\\\"\")\n", - "graph_fractional_component = pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\")\n", - "\n", - "graph_major_currency = pynutil.insert(\" currency: \\\"\") + major_currency + pynutil.insert(\"\\\"\")\n", - "graph_minor_currency = pynutil.insert(\" currency: \\\"\") + minor_currency + pynutil.insert(\"\\\"\")\n", - "\n", - "graph_only_major_money = graph_integer_component + delete_space + graph_major_currency\n", - "graph_only_minor_money = graph_fractional_component + delete_space + graph_minor_currency " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XTmxrK4DmS39" - }, - "source": [ - "Now we may append the case of an implicit `fractional_part` to `graph_only_major_money`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Zvzn3pQinkT0" - }, - "outputs": [], - "source": [ - "implicit_fractional_part = delete_space + pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\") \n", - "implicit_fractional_part = pynini.closure(implicit_fractional_part, 0, 1) " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tKFZkCVmn1OX" - }, - "source": [ - "And the explicit fractional portion:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "d_h0pTlMn3jz" - }, - "outputs": [], - "source": [ - "delete_et = pynutil.delete(\"et \") # Sometimes prefaces the minor currency\n", - "delete_et = pynini.closure(delete_et, 0 , 1)\n", - "\n", - "delete_minor = pynutil.delete(minor_currency.project(\"input\")) # to remove the minor currency\n", - "\n", - "explicit_fractional_part = pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\") \n", - "explicit_fractional_part = delete_space + delete_et + explicit_fractional_part + delete_space + delete_minor\n", - "explicit_fractional_part = pynini.closure(explicit_fractional_part, 0, 1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rvnpAudgo-o3" - }, - "source": [ - "We join them together:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qYzlIRWTpD8e" - }, - "outputs": [], - "source": [ - "graph_major_money = graph_only_major_money + (implicit_fractional_part | explicit_fractional_part)\n", - "graph_standard_money = graph_major_money | graph_only_minor_money" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TzeaKXVzpYs8" - }, - "source": [ - 
"Finishing with the case the large quantities of money, we need to use `graph_decimal` so we can exploit its ability to map quantities. Note that since we are using a pre-existing WFST, we can ignore inserting the tags ourselves, since this is already done by the Decimal WFST. As long as we remember to process this aspect with our Verbalizer, we can spare ourselves the extra step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "LnqX9mGFpmJm" - }, - "outputs": [], - "source": [ - "graph_large_money = pynutil.insert(\" currency: \\\"\") + major_currency_no_normalize + pynutil.insert(\"\\\"\")\n", - "graph_large_money = graph_decimal + delete_space + graph_large_money" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "24TUZnJKqgPA" - }, - "source": [ - "Alltogether, this would give the following Classifier:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "B7-muCO2qizg" - }, - "outputs": [], - "source": [ - "class MoneyFst(GraphFst):\n", - " def __init__(self, cardinal: GraphFst, decimal: GraphFst):\n", - " super().__init__(name=\"money\", kind=\"classify\")\n", - " major_currency = pynini.string_map([(\"euro\", \"€\")])\n", - " minor_currency = pynini.string_map([(\"centime\", \"€\")])\n", - "\n", - " graph_plural = pynutil.delete(\"s\").ques\n", - "\n", - " major_currency += graph_plural\n", - " minor_currency += graph_plural\n", - "\n", - " major_currency_no_normalize = major_currency.project(\"input\")\n", - " graph_preposition = pynini.union(\"des \", \"d'\") # Used for large amounts (billions de euros)\n", - " major_currency_no_normalize = graph_preposition + major_currency.project(\"input\")\n", - "\n", - " graph_cardinal = cardinal.graph_no_exception\n", - " graph_decimal = decimal.final_graph_wo_negative\n", - "\n", - " add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert(\"0\") + NEMO_DIGIT)\n", - " graph_fractional_values = graph_cardinal @ add_leading_zero_to_double_digit\n", - "\n", - " graph_integer_component = pynutil.insert(\"integer_part: \\\"\") + graph_cardinal + pynutil.insert(\"\\\"\")\n", - " graph_fractional_component = pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\")\n", - "\n", - " graph_major_currency = pynutil.insert(\" currency: \\\"\") + major_currency + pynutil.insert(\"\\\"\")\n", - " graph_minor_currency = pynutil.insert(\" currency: \\\"\") + minor_currency + pynutil.insert(\"\\\"\")\n", - "\n", - " graph_only_major_money = graph_integer_component + delete_space + graph_major_currency\n", - " graph_only_minor_money = graph_fractional_component + delete_space + graph_minor_currency \n", - "\n", - " implicit_fractional_part = delete_space + pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\") \n", - " implicit_fractional_part = pynini.closure(implicit_fractional_part, 0, 1) \n", - "\n", - "\n", - " delete_et = pynutil.delete(\"et \") # Sometimes prefaces the minor currency\n", - " delete_et = pynini.closure(delete_et, 0 , 1)\n", - "\n", - " delete_minor = pynutil.delete(minor_currency.project(\"input\")) # to remove the minor currency\n", - "\n", - " explicit_fractional_part = pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\") \n", - " explicit_fractional_part = delete_space + delete_et + explicit_fractional_part + delete_space + delete_minor\n", - " explicit_fractional_part = 
pynini.closure(explicit_fractional_part, 0, 1)\n", - "\n", - "        graph_major_money = graph_only_major_money + (implicit_fractional_part | explicit_fractional_part)\n", - "\n", - "        graph_large_money = pynutil.insert(\" currency: \\\"\") + major_currency_no_normalize + pynutil.insert(\"\\\"\")\n", - "        graph_large_money = graph_decimal + delete_space + graph_large_money\n", - "\n", - "        final_graph = graph_large_money | graph_major_money | graph_only_minor_money\n", - "\n", - "        final_graph = self.add_tokens(final_graph)\n", - "        self.fst = final_graph.optimize()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's see the results:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from nemo_text_processing.inverse_text_normalization.fr.taggers import decimal, cardinal\n", - "\n", - "cardFst = cardinal.CardinalFst()\n", - "decFst = decimal.DecimalFst(cardFst)\n", - "\n", - "moneyFst = MoneyFst(cardFst, decFst).fst\n", - "\n", - "example = \"douze virgule cinq billions d'euros\"\n", - "\n", - "apply_fst(example, moneyFst)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gxdcyuLmAZZa" - }, - "source": [ - "## Verbalizer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZZFDWNwY6sOG" - }, - "source": [ - "By this point, the creation of the Verbalizer should be rather straightforward: delete the expected tokens and perform any specific formatting that was not caught by the Classifier. \n", - "\n", - "In fact, it is so straightforward that much of the work does not even need to be explicitly managed by the Verbalizer. As mentioned previously, two of the properties we inserted in our Classifier were already referenced in our `DecimalFst`: `integer_part` and `fractional_part`. We even went so far as to directly call a component of `DecimalFst` in our Classifier. As such, outside of the `currency` property, there is little in our Money token that is different from a standard Decimal token. Indeed, even the normalized forms are similar (`200,5` vs. `200,5 €`). " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T7sgH0t79tmU" - }, - "source": [ - "Given these similarities, it seems that we can save ourselves some work and simply use the Decimal Verbalizer to manage much of the normalization. 
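(For instance, a Money token such as `money { integer_part: \"2\" fractional_part: \"05\" currency: \"€\" }` differs from a Decimal token only by the trailing `currency` field, and its target output `2,05 €` differs from `2,05` only by the appended symbol.) 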
Let's look at the basic format of our `MoneyFst` verbalizer, writing it so it accepts a `DecimalFst` as input:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BEu8nITP9mSG" - }, - "outputs": [], - "source": [ - "class MoneyFst(GraphFst):\n", - " def __init__(self, decimal: GraphFst):\n", - " super().__init__(name=\"money\", kind=\"verbalize\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JYVLou5N-Dk8" - }, - "source": [ - "We manage the issue of deleting the `currency` property:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "LO35tJ7G-H6N" - }, - "outputs": [], - "source": [ - "class MoneyFst(GraphFst):\n", - " def __init__(self, decimal: GraphFst):\n", - " super().__init__(name=\"money\", kind=\"verbalize\")\n", - " unit = (\n", - " pynutil.delete(\"currency:\")\n", - " + delete_extra_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_NOT_QUOTE, 1)\n", - " + pynutil.delete(\"\\\"\")\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDS8XSII-Dpd" - }, - "source": [ - "Now consider, we need to normalize an integer component, a fractional component, and a decimal to separate them. Since NeMo will automatically permutate all tags, we can assume whatever order we want. As such, we can assume we get the exact order that is accepted by our `DecimalFst`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VtGfpjVA-r3u" - }, - "outputs": [], - "source": [ - " def __init__(self, decimal: GraphFst):\n", - " super().__init__(name=\"money\", kind=\"verbalize\")\n", - " unit = (\n", - " pynutil.delete(\"currency:\")\n", - " + delete_extra_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_NOT_QUOTE, 1)\n", - " + pynutil.delete(\"\\\"\")\n", - " )\n", - " graph = decimal.numbers + delete_space + unit\n", - " delete_tokens = self.delete_tokens(graph)\n", - " self.fst = delete_tokens.optimize()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZefxZLIU-uRU" - }, - "source": [ - "It is as simple and compact as appending the `unit` component to the preexisting `decimal.numbers`. \n", - "\n", - "This feature is worth keeping in mind as you build up to more concrete classes: the combination of guaranteed tag permutations and prebuilt Verbalizers make the addition of semiotic classes progressively simpler despite the building complexity of your entire grammar." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WydC7Cn28l5Y" - }, - "source": [ - "# Time WFST " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VelunbumCJJe" - }, - "source": [ - "Our next composite graph will be for the Time WFST. Here, you may see more variation between your language and our example than with our previous classes. This is for a number of reasons, among them being that while there may be some standard cross linguistic patterns regarding time (e.g. `quantity_of_hours + quantity_of_minutes`), the use of various equivalent phrases can make an exhaustive grammar incredibly specific (e.g. consider managing \"twelve fifteen\", \"twelve and a quarter\", \"quarter past twelve\", \"quarter after twelve\", and \"forty five until one\" all together). 
You may find yourself drawing upon WFSTs that accommodate Cardinals, Fractions, and some basic subtraction.\n", - "\n", - "As such, we are going to focus on those aspects of the Time WFST that are necessary for a functional normalization of time-related phrases, saving a more exhaustive grammar for your own specific languages and use cases." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8wqb28wzATOR" - }, - "source": [ - "## Grammar" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AVntDM3AEz0v" - }, - "source": [ - "For our Time WFST, we will focus on the following aspects:\n", - "- Use of 24 or 12 hour base\n", - "- Use of fraction terminology (e.g. \"quarter\" = `15`)\n", - "- Accommodation of key-words (\"noon\", \"midnight\")\n", - "- Counting backwards from the hour (\"ten to five\", \"five to three\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "seU9hTbgFgu7" - }, - "source": [ - "We'll start with the basic system.\n", - "\n", - "For French, time operates on a twenty-four hour system, with the zeroth hour being midnight. Time is given in the following format:\n", - "\n", - "`cardinal + heure(s) + (cardinal)` \n", - "\n", - "This is normalized as:\n", - "\n", - "`cardinal h (cardinal)`\n", - "\n", - "For instance, for `3:03`, we would have:\n", - "- input: \"trois heures trois\"\n", - "- output: `3 h 03`\n", - "\n", - "As such, our grammar needs to utilize a Cardinal WFST and have a means to accept \"heures\" from the input. Taking care of the latter case is simple enough:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HTSVxf4fI_ND" - }, - "outputs": [], - "source": [ - "graph_heures = pynini.accep(\"heure\") + pynini.accep(\"s\").ques\n", - "graph_heures = pynutil.delete(graph_heures)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6LW7pXaXJSZa" - }, - "source": [ - "For the cardinals, we could pass an instance of `CardinalFst` to our graph. But do we really need that level of coverage? We only really need to cover the numbers 0 to 60, which we could simply write a new WFST for. Further, it may be beneficial to allow our graph to avoid possible ambiguity. While we will not cover it in our tutorial, you may in the future find it necessary to build a WFST for Measurements, in which quantities of time may play a part. Would it not be helpful for your WFST to know that \"thirty hours\" could only ever be a measurement instead of a possible time of day?\n", - "\n", - "Given the little amount of effort necessary and the quick benefit, we choose to make our hours and minutes explicit in the Time WFST."
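, - "\n\nAs a rough sanity target (given the mappings below and the Verbalizer we will write later), \"dix-sept heures dix\" should eventually normalize to `17 h 10` and \"deux heures et demie\" to `2 h 30`."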
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R4aa06ZPLKIR" - }, - "outputs": [], - "source": [ - "hours = pynini.string_map([\n", - "    (\"zéro\",\"0\"),\n", - "    (\"une\",\"1\"),\n", - "    (\"deux\",\"2\"),\n", - "    (\"trois\",\"3\"),\n", - "    (\"quatre\",\"4\"),\n", - "    (\"cinq\",\"5\"),\n", - "    (\"six\",\"6\"),\n", - "    (\"sept\",\"7\"),\n", - "    (\"huit\",\"8\"),\n", - "    (\"neuf\",\"9\"),\n", - "    (\"dix\",\"10\"),\n", - "    (\"onze\",\"11\"),\n", - "    (\"douze\",\"12\"),\n", - "    (\"treize\",\"13\"),\n", - "    (\"quatorze\",\"14\"),\n", - "    (\"quinze\",\"15\"),\n", - "    (\"seize\",\"16\"),\n", - "    (\"dix-sept\",\"17\"),\n", - "    (\"dix-huit\",\"18\"),\n", - "    (\"dix-neuf\",\"19\"),\n", - "    (\"vingt\",\"20\"),\n", - "    (\"vingt-et-une\",\"21\"),\n", - "    (\"vingt et une\",\"21\"),\n", - "    (\"vingt-deux\",\"22\"),\n", - "    (\"vingt-trois\",\"23\"),\n", - "    (\"vingt-quatre\",\"24\"),\n", - "])\n", - "minutes = pynini.string_map([\n", - "    (\"une\", \"01\"),\n", - "    (\"deux\", \"02\"),\n", - "    (\"trois\", \"03\"),\n", - "    (\"quatre\", \"04\"),\n", - "    (\"cinq\", \"05\"),\n", - "    (\"six\", \"06\"),\n", - "    (\"sept\", \"07\"),\n", - "    (\"huit\", \"08\"),\n", - "    (\"neuf\", \"09\"),\n", - "    (\"dix\", \"10\"),\n", - "    (\"onze\", \"11\"),\n", - "    (\"douze\", \"12\"),\n", - "    (\"treize\", \"13\"),\n", - "    (\"quatorze\", \"14\"),\n", - "    (\"quinze\", \"15\"),\n", - "    (\"seize\", \"16\"),\n", - "    (\"dix-sept\", \"17\"),\n", - "    (\"dix-huit\", \"18\"),\n", - "    (\"dix-neuf\", \"19\"),\n", - "    (\"vingt\", \"20\"),\n", - "    (\"vingt-et-une\", \"21\"),\n", - "    (\"vingt et une\", \"21\"),\n", - "    (\"vingt-deux\", \"22\"),\n", - "    (\"vingt-trois\", \"23\"),\n", - "    (\"vingt-quatre\", \"24\"),\n", - "    (\"vingt-cinq\", \"25\"),\n", - "    (\"vingt-six\", \"26\"),\n", - "    (\"vingt-sept\", \"27\"),\n", - "    (\"vingt-huit\", \"28\"),\n", - "    (\"vingt-neuf\", \"29\"),\n", - "    (\"trente\", \"30\"),\n", - "    (\"trente-et-une\", \"31\"),\n", - "    (\"trente et une\", \"31\"),\n", - "    (\"trente-deux\", \"32\"),\n", - "    (\"trente-trois\", \"33\"),\n", - "    (\"trente-quatre\", \"34\"),\n", - "    (\"trente-cinq\", \"35\"),\n", - "    (\"trente-six\", \"36\"),\n", - "    (\"trente-sept\", \"37\"),\n", - "    (\"trente-huit\", \"38\"),\n", - "    (\"trente-neuf\", \"39\"),\n", - "    (\"quarante\", \"40\"),\n", - "    (\"quarante-et-une\", \"41\"),\n", - "    (\"quarante et une\", \"41\"),\n", - "    (\"quarante-deux\", \"42\"),\n", - "    (\"quarante-trois\", \"43\"),\n", - "    (\"quarante-quatre\", \"44\"),\n", - "    (\"quarante-cinq\", \"45\"),\n", - "    (\"quarante-six\", \"46\"),\n", - "    (\"quarante-sept\", \"47\"),\n", - "    (\"quarante-huit\", \"48\"),\n", - "    (\"quarante-neuf\", \"49\"),\n", - "    (\"cinquante\", \"50\"),\n", - "    (\"cinquante-et-une\", \"51\"),\n", - "    (\"cinquante et une\", \"51\"),\n", - "    (\"cinquante-deux\", \"52\"),\n", - "    (\"cinquante-trois\", \"53\"),\n", - "    (\"cinquante-quatre\", \"54\"),\n", - "    (\"cinquante-cinq\", \"55\"),\n", - "    (\"cinquante-six\", \"56\"),\n", - "    (\"cinquante-sept\", \"57\"),\n", - "    (\"cinquante-huit\", \"58\"),\n", - "    (\"cinquante-neuf\", \"59\"),\n", - "])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4SmNsNKLM9cC" - }, - "source": [ - "Now that we've managed the basic graph, we can address some of the more niche rules of French timekeeping.\n", - "\n", - "To start, French employs some colloquialisms that will be familiar to English speakers: minutes that are multiples of fifteen are referred to as fractions of a clock. 
In particular:\n", - "- `5 h 15` -> \"cinq heures **et quart**\"\n", - "- `5 h 30` -> \"cinq heures **et demie**\"\n", - "- `5 h 45` -> \"cinq heures **et trois quarts**\"\n", - "\n", - "We thus need a means of rendering these as their numerical equivalents:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xHe3nfrpSlrE" - }, - "outputs": [], - "source": [ - "# Mapping 'et demi' and 'et quart'\n", - "graph_et = pynutil.delete(\"et\") + delete_space\n", - "\n", - "graph_demi = pynini.accep(\"demi\")\n", - "graph_demi += pynini.accep(\"e\").ques # people vary on feminine or masculine form\n", - "graph_demi = pynini.cross(graph_demi, \"30\")\n", - "\n", - "graph_quart = pynini.accep('quart')\n", - "graph_quart = pynini.cross(graph_quart, '15')\n", - "graph_trois_quart = pynini.cross(\"trois quarts\", \"45\")\n", - "\n", - "graph_fractions = graph_demi | graph_quart | graph_trois_quart\n", - "graph_fractions = graph_et + graph_fractions" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HD2wobIQS3fX" - }, - "source": [ - "Also like English, French will use key words to designate a specific timeslot. Noon and midnight are \"midi\" and \"minuit\" respectively." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ahbkiZFuTN2t" - }, - "outputs": [], - "source": [ - "# Midi and minuit\n", - "graph_midi = pynini.cross(\"midi\", \"12\")\n", - "graph_minuit = pynini.cross(\"minuit\", \"0\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6OyMoqfZTX1U" - }, - "source": [ - "Now it's time to throw a wrench into things: counting backwards from the hour. How are we to get what is essentially a graph to do the subtraction necessary for \"ten to twelve\" to become `11:50`?\n", - "\n", - "Easy: we build the subtraction into the graph itself. That is, we map the hours and minutes produced by our graph onto another graph that produces their values shifted back appropriately.\n", - "\n", - "Let's take our \"ten to twelve\" example. Normally \"ten\" would map to `10` and \"twelve\" to `12`. But with these new graphs, the detection of the pattern `minute + to + hour` would signal that `10` should now become `50` and `12` become `11`." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uMWifbm1VQjP" - }, - "source": [ - "Let us do this for our French example. 
Luckily enough, the indication that a French string is regular: counting backwards from the hour is by use of the pattern `cardinal + heures + moins + minutes`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c4bV3T1pViCH" - }, - "outputs": [], - "source": [ - "hours_to = pynini.string_map([\n", - " (\"1\",\"0\"),\n", - " (\"2\",\"1\"),\n", - " (\"3\",\"2\"),\n", - " (\"4\",\"3\"),\n", - " (\"5\",\"4\"),\n", - " (\"6\",\"5\"),\n", - " (\"7\",\"6\"),\n", - " (\"8\",\"7\"),\n", - " (\"9\",\"8\"),\n", - " (\"10\",\"9\"),\n", - " (\"11\",\"10\"),\n", - " (\"12\",\"11\"),\n", - " (\"13\",\"12\"),\n", - " (\"14\",\"13\"),\n", - " (\"15\",\"14\"),\n", - " (\"16\",\"15\"),\n", - " (\"17\",\"16\"),\n", - " (\"18\",\"17\"),\n", - " (\"19\",\"18\"),\n", - " (\"20\",\"19\"),\n", - " (\"21\",\"20\"),\n", - " (\"22\",\"21\"),\n", - " (\"23\",\"22\"),\n", - " (\"24\",\"23\"),\n", - " (\"0\",\"23\"),\n", - "])\n", - "minutes_to = pynini.string_map([\n", - " (\"59\", \"01\"),\n", - " (\"58\", \"02\"),\n", - " (\"57\", \"03\"),\n", - " (\"56\", \"04\"),\n", - " (\"55\", \"05\"),\n", - " (\"54\", \"06\"),\n", - " (\"53\", \"07\"),\n", - " (\"52\", \"08\"),\n", - " (\"51\", \"09\"),\n", - " (\"50\", \"10\"),\n", - " (\"49\", \"11\"),\n", - " (\"48\", \"12\"),\n", - " (\"47\", \"13\"),\n", - " (\"46\", \"14\"),\n", - " (\"45\", \"15\"),\n", - " (\"44\", \"16\"),\n", - " (\"43\", \"17\"),\n", - " (\"42\", \"18\"),\n", - " (\"41\", \"19\"),\n", - " (\"40\", \"20\"),\n", - " (\"39\", \"21\"),\n", - " (\"38\", \"22\"),\n", - " (\"37\", \"23\"),\n", - " (\"36\", \"24\"),\n", - " (\"35\", \"25\"),\n", - " (\"34\", \"26\"),\n", - " (\"33\", \"27\"),\n", - " (\"32\", \"28\"),\n", - " (\"31\", \"29\"),\n", - " (\"30\", \"30\"),\n", - " (\"29\", \"31\"),\n", - " (\"28\", \"32\"),\n", - " (\"27\", \"33\"),\n", - " (\"26\", \"34\"),\n", - " (\"25\", \"35\"),\n", - " (\"24\", \"36\"),\n", - " (\"23\", \"37\"),\n", - " (\"22\", \"38\"),\n", - " (\"21\", \"39\"),\n", - " (\"20\", \"40\"),\n", - " (\"19\", \"41\"),\n", - " (\"18\", \"42\"),\n", - " (\"17\", \"43\"),\n", - " (\"16\", \"44\"),\n", - " (\"15\", \"45\"),\n", - " (\"14\", \"46\"),\n", - " (\"13\", \"47\"),\n", - " (\"12\", \"48\"),\n", - " (\"11\", \"49\"),\n", - " (\"10\", \"50\"),\n", - " (\"09\", \"51\"),\n", - " (\"08\", \"52\"),\n", - " (\"07\", \"53\"),\n", - " (\"06\", \"54\"),\n", - " (\"05\", \"55\"),\n", - " (\"04\", \"56\"),\n", - " (\"03\", \"57\"),\n", - " (\"02\", \"58\"),\n", - " (\"01\", \"59\"),\n", - "])\n", - "graph_moins = pynutil.delete(\"moins\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XOKETkIYZy5M" - }, - "source": [ - "Why graph the digits instead of the tokens themselves? Along with avoiding some minor repetition and making editing more apparent, it allows this subgraph to be ported to other languages - if so desired.\n", - "\n", - "Further, it helps us illustrate a helpful idea within this tutorial: as long as a pattern is regular and/or finite, it is no major issue to accommodate it in our graph, regardless of mathematic or logic system it employs." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DJbFiD2fAUc5" - }, - "source": [ - "## Classifier" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cK0SGXntaDkI" - }, - "source": [ - "Once again we place the grammar within the proper child class of `GraphFst`. 
We also insert the proper tags for the `Time` class, which are:\n", - "- `hours`\n", - "- `minutes`\n", - "- `suffix` (explained within this section)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9Eq5r-_VbBIg" - }, - "outputs": [], - "source": [ - "graph_hours_component = pynini.union(hours, graph_midi, graph_minuit)\n", - "graph_hours_component = pynutil.insert(\"hours: \\\"\") + graph_hours_component + pynutil.insert(\"\\\"\")\n", - "\n", - "graph_minutes_component = (\n", - " pynutil.insert(\" minutes: \\\"\") + pynini.union(minutes, graph_fractions) + pynutil.insert(\"\\\"\")\n", - ") \n", - "graph_minutes_component = delete_space + graph_minutes_component\n", - "\n", - "graph_time_standard = (graph_hours_component + delete_space + graph_heures \n", - " + pynini.closure(graph_minutes_component, 0, 1))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2avfS3IacSiC" - }, - "source": [ - "We now setup the alternate graph that allows backwards counting. Note, this is triggered by the occurrence of \"moins\" between the hour and minute component." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "TmpwisOVcn0T" - }, - "outputs": [], - "source": [ - "graph_hours_to_component = hours | graph_midi | graph_minuit\n", - "graph_hours_to_component @= hours_to\n", - "graph_hours_to_component = pynutil.insert(\"hours: \\\"\") + graph_hours_to_component + pynutil.insert(\"\\\"\")\n", - "graph_hours_to_component = graph_hours_to_component + delete_space + graph_heures\n", - "\n", - "graph_minutes_to_component = (minutes | graph_demi | # No 'et' in fractions\n", - " (pynutil.delete(\"le \") + graph_quart) | graph_trois_quart)\n", - "graph_minutes_to_component @= minutes_to\n", - "graph_minutes_to_component = pynutil.insert(\" minutes: \\\"\") + graph_minutes_to_component + pynutil.insert(\"\\\"\")\n", - "\n", - "graph_time_to = graph_hours_to_component + delete_space + graph_moins + delete_space + graph_minutes_to_component" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FkO4tRRfdQT4" - }, - "source": [ - "We now join it with our main component, allowing us to graph all times:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0O0vUVizdU8c" - }, - "outputs": [], - "source": [ - "graph_time = graph_time_standard | graph_time_to" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jbX4JV-LdY3Y" - }, - "source": [ - "Once again we throw a wrench into things with the `suffix` feature. As in the case of Ordinals and Decimals, key-words can play into our Time WFST. For French, this occurs with the words \"du matin\", \"de l'après-midi\", and \"du soir\". (Respectively: \"in the morning\", \"in the afternoon\", and \"in the evening\".) Much like in English, these phrases alter how we write down the time. But instead of indicating `a.m.` or `p.m.`, these indicate *what hour system is used*. For example:\n", - "- \"deux heures du matin\" -> `2 h` = `2:00 a.m.`\n", - "- \"deux heures de l'après-midi\" -> `14 h` = `2:00 p.m.`\n", - "\n", - "Only a twelve hour system is used when these suffixes accompany the time. As such, our Classifier will need to either adjust the times like in the case of counting backwards or must pass the information to the Verbalizer so it can adjust. \n", - "\n", - "Since our Classifier is long enough as is, we will simply store this information in the `suffix` property and allow the Verbalizer to manage." 
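, - "\n\nFor example, \"deux heures et quart du soir\" should be tagged along the lines of `time { hours: \"2\" minutes: \"15\" suffix: \"pm\" }`, leaving the actual shift to `14 h 15` for the Verbalizer."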
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "OqVa78zRgJw9" - }, - "outputs": [], - "source": [ - "graph_suffix_am = pynini.cross(\"du matin\", \"am\")\n", - "graph_suffix_pm = pynini.string_map([(\"de l'après-midi\", \"pm\"),(\"du soir\", \"pm\")])\n", - "\n", - "graph_suffix = pynini.cross(graph_suffix_am, \"am\") | pynini.cross(graph_suffix_pm, \"pm\")\n", - "\n", - "graph_suffix_component = pynutil.insert(\" suffix: \\\"\") + graph_suffix + pynutil.insert(\"\\\"\")\n", - "graph_suffix_component = delete_space + graph_suffix_component\n", - "graph_suffix_component = pynini.closure(graph_suffix_component, 0, 1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-LaJMIjUf1XR" - }, - "source": [ - "And we append to our graph:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "76myCFiggX3E" - }, - "outputs": [], - "source": [ - "class TimeFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"time\", kind=\"classify\")\n", - " \"\"\"grammar omitted for length\n", - " ....\n", - " ....\n", - " ....\n", - " \"\"\"\n", - " graph_hours_component = pynini.union(hours, graph_midi, graph_minuit)\n", - " graph_hours_component = pynutil.insert(\"hours: \\\"\") + graph_hours_component + pynutil.insert(\"\\\"\")\n", - "\n", - " graph_minutes_component = (\n", - " pynutil.insert(\" minutes: \\\"\") + pynini.union(minutes, graph_fractions) + pynutil.insert(\"\\\"\")\n", - " ) \n", - " graph_minutes_component = delete_space + graph_minutes_component\n", - "\n", - " graph_time_standard = (graph_hours_component + delete_space + graph_heures \n", - " + pynini.closure(graph_minutes_component, 0, 1))\n", - "\n", - " graph_hours_to_component = hours | graph_midi | graph_minuit\n", - " graph_hours_to_component @= hours_to\n", - " graph_hours_to_component = pynutil.insert(\"hours: \\\"\") + graph_hours_to_component + pynutil.insert(\"\\\"\")\n", - " graph_hours_to_component = graph_hours_to_component + delete_space + graph_heures\n", - "\n", - " graph_minutes_to_component = (minutes | graph_demi | # No 'et' in fractions\n", - " (pynutil.delete(\"le \") + graph_quart) | graph_trois_quart)\n", - " graph_minutes_to_component @= minutes_to\n", - " graph_minutes_to_component = pynutil.insert(\" minutes: \\\"\") + graph_minutes_to_component + pynutil.insert(\"\\\"\")\n", - "\n", - " graph_time_to = graph_hours_to_component + delete_space + graph_moins + delete_space + graph_minutes_to_component\n", - "\n", - " graph_time_no_suffix = graph_time_standard | graph_time_to\n", - "\n", - " graph_suffix_am = pynini.cross(\"du matin\", \"am\")\n", - " graph_suffix_pm = pynini.string_map([(\"de l'après-midi\", \"pm\"),(\"du soir\", \"pm\")])\n", - "\n", - " graph_suffix = pynini.cross(graph_suffix_am, \"am\") | pynini.cross(graph_suffix_pm, \"pm\")\n", - "\n", - " graph_suffix_component = pynutil.insert(\" suffix: \\\"\") + graph_suffix + pynutil.insert(\"\\\"\")\n", - " graph_suffix_component = delete_space + graph_suffix_component\n", - " graph_suffix_component = pynini.closure(graph_suffix_component, 0, 1)\n", - " \n", - " final_graph = graph_time_no_suffix + graph_suffix_component\n", - "\n", - " final_graph = self.add_tokens(final_graph)\n", - "\n", - " self.fst = final_graph.optimize()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's see how we did:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "time = 
TimeFst().fst\n", - "example = \"quatre heures moins cinq\"\n", - "apply_fst(example, time)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lPlJ1qyeAWOL" - }, - "source": [ - "## Verbalizer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CrO-xtJ87PEl" - }, - "source": [ - "The initial part of the Verbalizer should appear familiar. We delete the property tags `hours` and `minutes`, making sure they preserve the actual values for formatting." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fCzZKR7ek0Mz" - }, - "outputs": [], - "source": [ - "hour = (\n", - "    pynutil.delete(\"hours:\")\n", - "    + delete_space\n", - "    + pynutil.delete(\"\\\"\")\n", - "    + pynini.closure(NEMO_DIGIT, 1, 2)\n", - "    + pynutil.delete(\"\\\"\")\n", - ")\n", - "minute = (\n", - "    pynutil.delete(\"minutes:\")\n", - "    + delete_extra_space\n", - "    + pynutil.delete(\"\\\"\")\n", - "    + pynini.closure(NEMO_DIGIT, 1, 2)\n", - "    + pynutil.delete(\"\\\"\")\n", - ")\n", - "graph = hour + delete_extra_space + pynutil.insert(\"h\") + minute.ques" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WnVV9GUKk-b7" - }, - "source": [ - "We then deal with the case of `suffix`. We first note that if the suffix is for a morning time (before noon), then no further conversion is needed. We may simply delete the property and its value." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "haOEiSbglc6s" - }, - "outputs": [], - "source": [ - "day_suffixes = pynutil.delete(\"suffix: \\\"am\\\"\")\n", - "\n", - "graph = hour + delete_extra_space + pynutil.insert(\"h\") + minute.ques + delete_space + day_suffixes.ques" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wL0FNg6Xlhb-" - }, - "source": [ - "Meanwhile, the post-noon suffixes require us to shift the hours value by twelve. Much like in the case of counting backwards from the hour, we can simply create a WFST to do this addition work for us."
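, - "\n\nConcretely, an `hours: \"2\"` value accompanied by `suffix: \"pm\"` should surface as `14`, while the same value with `suffix: \"am\"` stays `2`."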
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YLrabUNplwG7" - }, - "outputs": [], - "source": [ - "hour_to_night = pynini.string_map([\n", - " (\"1\", \"13\"),\n", - " (\"2\", \"14\"),\n", - " (\"3\", \"15\"),\n", - " (\"4\", \"16\"),\n", - " (\"5\", \"17\"),\n", - " (\"6\", \"18\"),\n", - " (\"7\", \"19\"),\n", - " (\"8\", \"20\"),\n", - " (\"9\", \"21\"),\n", - " (\"10\", \"22\"),\n", - " (\"11\", \"23\"), # Note that 12 and 24 would be phrased \"midi\" and \"minuit\" respectively\n", - "])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "X0-z-qJAmIiI" - }, - "source": [ - "We then create an alternate graph where this conversion is mapped onto the hours function - given a post-noon suffix - and create a union with our earlier graph:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8CdEmo9NmN7u" - }, - "outputs": [], - "source": [ - "night_suffixes = pynutil.delete(\"suffix: \\\"pm\\\"\")\n", - "graph |= (\n", - " hour @ hour_to_night\n", - " + delete_extra_space\n", - " + pynutil.insert(\"h\")\n", - " + minute.ques\n", - " + delete_space\n", - " + night_suffixes\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YnoIkZBqmaTo" - }, - "source": [ - "Giving us a final Verbalizer of:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZfXimvFBmdDD" - }, - "outputs": [], - "source": [ - "class TimeFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"time\", kind=\"verbalize\")\n", - "\n", - " hour_to_night = pynini.string_map([\n", - " (\"1\", \"13\"),\n", - " (\"2\", \"14\"),\n", - " (\"3\", \"15\"),\n", - " (\"4\", \"16\"),\n", - " (\"5\", \"17\"),\n", - " (\"6\", \"18\"),\n", - " (\"7\", \"19\"),\n", - " (\"8\", \"20\"),\n", - " (\"9\", \"21\"),\n", - " (\"10\", \"22\"),\n", - " (\"11\", \"23\"),\n", - "])\n", - "\n", - " day_suffixes = pynutil.delete(\"suffix: \\\"am\\\"\")\n", - " night_suffixes = pynutil.delete(\"suffix: \\\"pm\\\"\")\n", - "\n", - " hour = (\n", - " pynutil.delete(\"hours:\")\n", - " + delete_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_DIGIT, 1, 2)\n", - " + pynutil.delete(\"\\\"\")\n", - " )\n", - " minute = (\n", - " pynutil.delete(\"minutes:\")\n", - " + delete_extra_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_DIGIT, 1, 2)\n", - " + pynutil.delete(\"\\\"\")\n", - " )\n", - "\n", - " graph = hour + delete_extra_space + pynutil.insert(\"h\") + minute.ques + delete_space + day_suffixes.ques\n", - "\n", - " graph |= (\n", - " hour @ hour_to_night\n", - " + delete_extra_space\n", - " + pynutil.insert(\"h\")\n", - " + minute.ques\n", - " + delete_space\n", - " + night_suffixes\n", - " )\n", - " delete_tokens = self.delete_tokens(graph)\n", - " self.fst = delete_tokens.optimize()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e5tPcCaSYuhY" - }, - "source": [ - "If you've noticed, the Verbalizer process has become simpler as we've progressed through our WFSTs. Commonly, you will seldom need to even provide the amount of overhead we've seen in `TimeFst`, `MoneyFst`, and `OrdinalFst`, and the majority of this component is simply removing tokens as an intermediary step, as we'll see for our Name class." 
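- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As a quick check of the Verbalizer (a small sketch using the `apply_fst` helper from earlier; note that this `TimeFst` is the verbalizing class defined just above, which shares its name with the Classifier), we can feed it a couple of hand-written tokens and compare against the outputs we expect:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "verbalizer = TimeFst().fst # the verbalizer class defined above\n", - "\n", - "example1 = 'time { hours: \"16\" minutes: \"05\" }'\n", - "example2 = 'time { hours: \"2\" minutes: \"30\" suffix: \"pm\" }'\n", - "\n", - "apply_fst(example1, verbalizer) # expected: 16 h 05\n", - "apply_fst(example2, verbalizer) # expected: 14 h 30 (the pm suffix triggers the hour shift)"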
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iHmRe3UIhyIH" - }, - "source": [ - "# WhiteList WFST " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8kMn2qB9bVFy" - }, - "source": [ - "\n", - "While developing your grammars, you may encounter tokens that refuse standard categorization and yet still require normalization. For example, you may need to render \"Mister Brown\" as `Mr. Brown` or \"H M S Nelson\" as `H.M.S. Nelson`. As these cases are rather specific, they lack a regular pattern for a specific classifier. (What about \"mister\" as a token requires tokenization as opposed to \"Brown\".) Instead, we need to explicitly list their input-output mappings (i.e. a whitelist).\n", - "\n", - "For NeMo, this is performed through the `WhiteListFst`:\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6B4oPXYcccWs" - }, - "source": [ - "## Grammar" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RThTLUCRceOO" - }, - "source": [ - "`WhitelistFst` is essentially just a wrapper for a `string_map` or `string_file` mapping with the appropriate formatting for deployment. Per our example, we can make a graph with the following:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eIOOb_wJdMMx" - }, - "outputs": [], - "source": [ - "graph = pynini.string_map([\n", - " (\"mister\", \"mr.\"),\n", - " (\"h m s\", \"h.m.s\"),\n", - " (\"doctor\", \"dr.\")\n", - "])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "O5kTXwmPZ9Tt" - }, - "source": [ - "As previously mentioned, here is where the use of `string_file` will make maintenance much easier. Discovering whitelist mappings is an iterative process and you will more than likely need to return to your list throughout development. For instance, it may be obvious that tokens such as \"madame\", \"miss\", \"esquire\", but would you think of providing abbreviations for \"the right honorable\" or \"tennessee valley authority\"? Keeping a tsv file available for quick insertions greatly assists here." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RC5Cf-Z8dYVk" - }, - "source": [ - "## Classifier" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "144nvAHEdfBJ" - }, - "source": [ - "Unlike for our other WFSTs, There is no specific semiotic class for `WhiteListFst`. It instead falls under the default Name class to designate there is no need for further processing beyond obligatory tokenization. Indeed, we can simply insert the token ourselves instead of calling `add_tokens`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oPkrmg2gdznd" - }, - "outputs": [], - "source": [ - "class WhiteListFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"whitelist\", kind=\"classify\")\n", - "\n", - " whitelist = pynini.string_map([\n", - " (\"mister\", \"mr.\"),\n", - " (\"h m s\", \"h.m.s\"),\n", - " (\"doctor\", \"dr.\")])\n", - " graph = pynutil.insert(\"name: \\\"\") + convert_space(whitelist) + pynutil.insert(\"\\\"\")\n", - " self.fst = graph.optimize()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "B05kdSIdd2dv" - }, - "source": [ - "## Verbalizer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since the whitelisted token has already been rendered in the desired normalized form, all that is necessary is to strip the `name` token and render the string 'as is'. 
This can be done through the following:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gaq3voIYiUCA" - }, - "outputs": [], - "source": [ - "class WhiteListFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"whitelist\", kind=\"verbalize\")\n", - " graph = (\n", - " pynutil.delete(\"name:\")\n", - " + delete_space\n", - " + pynutil.delete(\"\\\"\")\n", - " + pynini.closure(NEMO_CHAR - \" \", 1)\n", - " + pynutil.delete(\"\\\"\")\n", - " )\n", - " graph = graph @ pynini.cdrewrite(pynini.cross(u\"\\u00A0\", \" \"), \"\", \"\", NEMO_SIGMA) # Removes possible null token\n", - " self.fst = graph.optimize()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cUE7Gg35bWKb" - }, - "source": [ - "While the graph is largely self-explanatory, take note that the default implementation assumes a character string without spacing. If you intend to include additional formatting in your normalization (e.g. `H. M. S.` instead of `H.M.S.`), you may need to adjust the graph to expand coverage." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_o_a15Fg7niv" - }, - "source": [ - "# Word and Punctuation WFST " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Zi6lP7mTmnUV" - }, - "source": [ - "Continuing with the Name class, we will conclude with the Word and Punctuation WFSTs. These are among the simplest and most crucial classes of the entire ITN system, as they classify all tokens that are not caught by other semiotic classes. Since such tokens make up the majority of all strings your normalization system will encounter, they are essential for general functionality.\n", - "\n", - "However, they require little discussion, as their function is self-evident: since they serve as default classes, tokens only reach the Word WFST and Punctuation WFST if they have not been accepted by the other WFSTs. As such, we can simply accept the tokens as they are, providing them a `name` tag." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9zCqczLqp5NW" - }, - "source": [ - "## Classifier" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eUWum5U0p99c" - }, - "source": [ - "For instance, consider the `WordFst` Classifier in its entirety:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "CCZSTeDHofDl" - }, - "outputs": [], - "source": [ - "class WordFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"word\", kind=\"classify\")\n", - " word = pynutil.insert(\"name: \\\"\") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert(\"\\\"\")\n", - " self.fst = word.optimize()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9ys2VpjjoiEC" - }, - "source": [ - "It just processes the entire token string with the `NEMO_NOT_SPACE` utility WFST (which accepts any string that is not a space). For your language, you may simply reuse one of the preexisting `WordFst` grammars.\n", - "\n", - "Depending on the language, the `PunctuationFst` may require some (minimal) adjustment.
Note the following:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mnnd3PVMpF4t" - }, - "outputs": [], - "source": [ - "class PunctuationFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"punctuation\", kind=\"classify\")\n", - "\n", - " s = \"!#$%&\\'()*+,-./:;<=>?@^_`{|}~\"\n", - " punct = pynini.union(*s)\n", - "\n", - " graph = pynutil.insert(\"name: \\\"\") + punct + pynutil.insert(\"\\\"\")\n", - "\n", - " self.fst = graph.optimize()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_afW02LXpLtz" - }, - "source": [ - "If your language uses punctuation other than that in the `s` string (or treats some of these punctuation marks as ordinary characters), you may simply edit `s` to accommodate. \n", - "\n", - "For instance, French has a unique quotation style that utilizes guillemets \"« »\". We may add their Unicode codepoints (to avoid encoding issues) to `s`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mgfZIKzVplVm" - }, - "outputs": [], - "source": [ - "class PunctuationFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"punctuation\", kind=\"classify\")\n", - "\n", - " s = \"!#$%&\\'()*+,-./:;<=>?@^_`{|}~\"\n", - " guillemets = \"\\u00AB\" + \"\\u00BB\" # quotation marks in French.\n", - " s += guillemets\n", - " punct = pynini.union(*s)\n", - "\n", - " graph = pynutil.insert(\"name: \\\"\") + punct + pynutil.insert(\"\\\"\")\n", - "\n", - " self.fst = graph.optimize()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6Upb5-wcp_7H" - }, - "source": [ - "## Verbalizer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ufWT1T6GqCCT" - }, - "source": [ - "Note that `PunctuationFst` and `WordFst` both encode with the `name` property. This leaves no differentiation between the two for a Verbalizer. This makes sense: there are no particular formatting rules for them; they simply need a placeholder tag to avoid alteration between the Classifier and Verbalizer steps. Once passed to the verbalizer, they are rendered as normal by simply removing the tag (this is practically identical to the `WhiteListFst`):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "LqyhqQKZqcph" - }, - "outputs": [], - "source": [ - "class WordFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"word\", kind=\"verbalize\")\n", - " chars = pynini.closure(NEMO_CHAR - \" \", 1)\n", - " char = pynutil.delete(\"name:\") + delete_space + pynutil.delete(\"\\\"\") + chars + pynutil.delete(\"\\\"\")\n", - " graph = char @ pynini.cdrewrite(pynini.cross(u\"\\u00A0\", \" \"), \"\", \"\", NEMO_SIGMA) # Cleans up possible null character\n", - "\n", - " self.fst = graph.optimize()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lGbrUkcpapyi" - }, - "source": [ - "For many languages, writing your `WordFst` and `PunctuationFst` (both Classifiers and Verbalizers) will require no more than duplicating the preexisting grammars found in NeMo Text Processing."
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5y9jhkhQ7p4W" - }, - "source": [ - "# Other Classes " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "j1mgnISmiu-g" - }, - "source": [ - "While the preceding discussion should be sufficient for development of the remaining classes, some helpful notes may be of use before continuing:\n", - "- Fraction WFST: This is the last of the 'fundamental' classes and should take priority after completion of the Decimal WFST. It operates very similarly to the Ordinal WFST in that you wish to recover the Cardinal roots for the numerator and denominator prior to tagging. Its properties are: `negative`, `integer_part`, `numerator`, and `denominator`.\n", - "- Measure WFST: Like the Money WFST, this will require management of several 'parent' WFSTs (Fraction, Cardinal, Decimal) to be suitably comprehensive. As well, you may find it more productive to compose new measurement units instead of simply listing them all (e.g. micrometers, petameters, miles per hour, feet per second). Its properties are `negative` and `units`, and it allows subgraphs of the `cardinal`, `decimal`, and `fraction` classes. (That is, it allows tokenization within the tokenization.)\n", - "- Date WFST: Depending on writing conventions, this may vary in complexity. For instance, English speakers may write dates as `01/01/2021` or `Jan. 1 2021`. Are there specific use cases where one is preferred, or should you simply decide on a format? Further, you may wish to take advantage of the `preserve_order` property to avoid possible unwanted verbalizations (some implementations will permit both `Jan. 1` and `1 Jan.` if you are not careful.) Its properties are: `month`, `day`, and `year`. \n", - "- Telephone WFST: These will depend heavily not only on writing conventions but also on regional preference. For instance, the U.S. commonly uses a ten-digit system broken into the following sequence: `###-###-####`. Meanwhile, mainland France breaks a ten-digit sequence into groups of two: `##-##-##-##-##`. Take careful note of how your language's target region verbalizes these figures and leave room for some variation in development. The `telephone` class has only one property: `number_part`. \n", - "- Electronic WFST: For normalizing email addresses or URLs, you will need to develop for the `electronic` class. The main concerns will be managing alphanumeric strings and parsing the reserved symbols used for protocols and domains. (How does your target language pronounce \"https://\"? \"www\"? '.' or '@'?) Depending on whether you are normalizing a URL or an email address, the following properties will be needed:\n", - " - email: `username`, `domain`\n", - " - url: `protocol` (Sparrowhawk allows further detail here but NeMo passes the entire URL through the `protocol` property)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-i25X8mK90n3" - }, - "source": [ - "# Tokenize and Classify " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "v4bcigU6b9ss" - }, - "source": [ - "We are now ready to build a general Classifier for our entire language. Upon completion of your grammars, the next step is to unite them in a general Classifier WFST, preferably located within a `tokenize_and_classify.py` file.
This WFST will be responsible for determining the appropriate semiotic class for each token in your string and processing the necessary properties for normalization.\n", - "\n", - "For this section, we will focus on the following: grammar composition, assignment of weights, and importing/exporting as a FAR file. Since we will need to work with some instantiated graphs, let's preload them before proceeding. (Note the compilingtime.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from nemo_text_processing.inverse_text_normalization.fr.taggers.cardinal import CardinalFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.decimal import DecimalFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.money import MoneyFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.ordinal import OrdinalFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.punctuation import PunctuationFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.time import TimeFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.whitelist import WhiteListFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.taggers.word import WordFst\n", - "\n", - "cardinal = CardinalFst()\n", - "cardinal_graph = cardinal.fst\n", - "\n", - "ordinal = OrdinalFst(cardinal)\n", - "ordinal_graph = ordinal.fst\n", - "\n", - "decimal = DecimalFst(cardinal)\n", - "decimal_graph = decimal.fst\n", - "\n", - "whitelist_graph = WhiteListFst().fst\n", - "word_graph = WordFst().fst\n", - "time_graph = TimeFst().fst\n", - "money_graph = MoneyFst(cardinal, decimal).fst\n", - "punct_graph = PunctuationFst().fst" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MIv58eSocOV1" - }, - "source": [ - "## Grammar" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "k_RPlnfVdG5E" - }, - "source": [ - "As for all previous grammars, the `tokenize_and_classify` grammar inherits from a `GraphFst` as an individual class: `ClassifyFst`. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WHKG4c2WdW0G" - }, - "outputs": [], - "source": [ - "class ClassifyFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "j9_I6DJmdcOG" - }, - "source": [ - "This class is responsible for instantiating all subgraphs and passing necessary dependencies:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4YtmcxLOdlas" - }, - "outputs": [], - "source": [ - "class ClassifyFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")\n", - "\n", - " cardinal = CardinalFst()\n", - " cardinal_graph = cardinal.fst\n", - "\n", - " ordinal = OrdinalFst(cardinal)\n", - " ordinal_graph = ordinal.fst\n", - "\n", - " decimal = DecimalFst(cardinal)\n", - " decimal_graph = decimal.fst\n", - "\n", - " whitelist_graph = WhiteList().fst\n", - " word_graph = WordFst().fst\n", - " time_graph = TimeFst().fst\n", - " money_graph = MoneyFst(cardinal, decimal).fst\n", - " punct_graph = PunctuationFst().fst" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "y5vGvv3HeAY9" - }, - "source": [ - "We then join all the grammars together so `ClassifyFst` can apply them. 
Rather unceremoniously, this is accomplished by performing a union across all grammars (excluding `PunctuationFst`, to assist tokenization). We then follow this union by inserting the `tokens` class around the resulting formatting (required for processing):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oocgPQ5geZJO" - }, - "outputs": [], - "source": [ - "class ClassifyFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")\n", - "\n", - " cardinal = CardinalFst()\n", - " cardinal_graph = cardinal.fst\n", - "\n", - " ordinal = OrdinalFst(cardinal)\n", - " ordinal_graph = ordinal.fst\n", - "\n", - " decimal = DecimalFst(cardinal)\n", - " decimal_graph = decimal.fst\n", - "\n", - " whitelist_graph = WhiteListFst().fst\n", - " word_graph = WordFst().fst\n", - " time_graph = TimeFst().fst\n", - " money_graph = MoneyFst(cardinal, decimal).fst\n", - " punct_graph = PunctuationFst().fst\n", - "\n", - " classify = (\n", - " time_graph\n", - " | whitelist_graph\n", - " | decimal_graph\n", - " | cardinal_graph\n", - " | ordinal_graph\n", - " | money_graph\n", - " | word_graph\n", - " )\n", - " token = pynutil.insert(\"tokens { \") + classify + pynutil.insert(\" }\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ASWDXWQjfLEU" - }, - "source": [ - "Our graph is now able to process an individual token. But what about a string? Here you will need to be mindful of the tokenization behavior for your language and decide on your desired treatment of punctuation (hence exclusion from the main graph). \n", - "\n", - "For our purposes, we will assume the convention of space and punctuation serving as token separators. We graph punctuation as individual tokens" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "r6WztK2jwhFt" - }, - "outputs": [], - "source": [ - "punct_graph = PunctuationFst().fst\n", - "punct = pynutil.insert(\"tokens { \") + punct_graph + pynutil.insert(\" }\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9T2rT89jw3T1" - }, - "source": [ - "and join the `punct` graph with our `tokens` graph (inserting spaces between tokens for formatting)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rGtVOK-txKOP" - }, - "outputs": [], - "source": [ - "token = \"PLACEHOLDER\"\n", - "token_plus_punct = (\n", - " pynini.closure(punct + pynutil.insert(\" \")) + token + pynini.closure(pynutil.insert(\" \") + punct)\n", - " ) # Note the use of closure incase there are multiple punctuations\n", - "graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_gixfQ69xWPe" - }, - "source": [ - "then address space between tokens: \n", - "\n", - "`graph = delete_space + graph + delete_space`" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DWnmazWecyUG" - }, - "source": [ - "## Weighting " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "egHbwIbMx-hT" - }, - "source": [ - "Were we to leave our `ClassifyFst` like this, we would undoubtedly encounter a mountain of errors. What will stop our graph from treating punctuation that is part of a previous grammar as a token separator (e.g. \"vingt-et-un\")? 
How do we ensure that a currency string isn't treated as solely a decimal string with a `name` token following?\n", - "\n", - "As in previous cases, the solution lies in our choice of weights for the grammar." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "y3U7_M8CyxZ1" - }, - "source": [ - "Let us return to the main graph:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9VXe1dfsy3Be" - }, - "outputs": [], - "source": [ - "classify = (\n", - " time_graph\n", - " | whitelist_graph\n", - " | decimal_graph\n", - " | cardinal_graph\n", - " | ordinal_graph\n", - " | money_graph\n", - " | word_graph\n", - " )\n", - "punct = pynutil.insert(\"tokens { \") + punct_graph + pynutil.insert(\" }\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aY4vOFqxy5ua" - }, - "source": [ - "Beyond the path weights that we explicitly added, these graphs are currently weightless. Since we want the graphs themselves to be the general determiners of a path, let us use some default weights an order of magnitude beyond our path weights (we use `pynutil.add_weight`):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bthyt_Le2rsA" - }, - "outputs": [], - "source": [ - "classify = (\n", - " pynutil.add_weight(time_graph, 1)\n", - " | pynutil.add_weight(whitelist_graph, 1)\n", - " | pynutil.add_weight(decimal_graph, 1)\n", - " | pynutil.add_weight(cardinal_graph, 1)\n", - " | pynutil.add_weight(ordinal_graph, 1)\n", - " | pynutil.add_weight(money_graph, 1)\n", - " | pynutil.add_weight(word_graph, 1)\n", - " )\n", - "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1) + pynutil.insert(\" }\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xMNIJbzj3MMP" - }, - "source": [ - "Let's see what logical adjustments should be made. First off, we know that we want each class token to span the largest string possible. (e.g. We don't want \"quatre-vingt\" to be rendered as two `cardinal` classes with a hyphen in between.) As such, we want to penalize our graph for using more than one tokens. We can do so by establishing the following constraint: the sum of two or more tokens cannot be less than the weight of a single token. Or, for any pair of tokens `w_1` and `w_2`, their sum must always be greater than any other individual token (including themselves):\n", - "\n", - "`w_1 + w_2 > k >= w`\n", - "\n", - "To keep things simple, let us make the upper limit `2`. This means we should increase all the weights to keep our constraint:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "classify = (\n", - " pynutil.add_weight(time_graph, 1.1)\n", - " | pynutil.add_weight(whitelist_graph, 1.1)\n", - " | pynutil.add_weight(decimal_graph, 1.1)\n", - " | pynutil.add_weight(cardinal_graph, 1.1)\n", - " | pynutil.add_weight(ordinal_graph, 1.1)\n", - " | pynutil.add_weight(money_graph, 1.1)\n", - " | pynutil.add_weight(word_graph, 1.1)\n", - " )\n", - "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(\" }\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Do we want this constraint to include all tokens? Imagine if we had a string of multiple semiotic tokens in a row. Since this string's combined weight would be larger than any single class token, a grammar that served as a universal acceptor (i.e. 
`word_graph`) would be preferred over these individual classes. This would be obviously incorrect. As such, we want to make sure that `word_graph` would only be traversed when there is truly no other option:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qc_CU2ro63eg" - }, - "outputs": [], - "source": [ - "classify = (\n", - " pynutil.add_weight(time_graph, 1.1)\n", - " | pynutil.add_weight(whitelist_graph, 1.1)\n", - " | pynutil.add_weight(decimal_graph, 1.1)\n", - " | pynutil.add_weight(cardinal_graph, 1.1)\n", - " | pynutil.add_weight(ordinal_graph, 1.1)\n", - " | pynutil.add_weight(money_graph, 1.1)\n", - " | pynutil.add_weight(word_graph, 100)\n", - " )\n", - "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(\" }\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, even with a string of fifty different class tokens, `word_graph` would still not be considered as a path to traverse." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fW8C3vD-7Dbl" - }, - "source": [ - "Next, let us consider our foundational graph: `cardinal_graph`. As Cardinals occur in practically all our WFSTs, it's possible for `cardinal_graph` to apply in almost all cases. Yet, we've specifically invoked `CardinalFST` when it was required in any of the other classes, so it will never be needed in any of those cases. This means that we want all those graphs to have *priority* over `cardinal_graph`. As such, we will increase its weight so it takes second lowest precedence (while still paying attention to the combined weight constraint). " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "97UwGaEn8pj7" - }, - "outputs": [], - "source": [ - "classify = (\n", - " pynutil.add_weight(time_graph, 1.1)\n", - " | pynutil.add_weight(whitelist_graph, 1.1)\n", - " | pynutil.add_weight(decimal_graph, 1.1)\n", - " | pynutil.add_weight(cardinal_graph, 1.2)\n", - " | pynutil.add_weight(ordinal_graph, 1.1)\n", - " | pynutil.add_weight(money_graph, 1.1)\n", - " | pynutil.add_weight(word_graph, 100)\n", - " )\n", - "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(\" }\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0d9Lw4Ot88_B" - }, - "source": [ - "This form of thinking can be applied to all the 'foundational' graphs you may develop: the dependent graphs should take higher precedence than the graphs they borrow from. For instance, since `money_graph` utilizes `decimal_graph`, we know it should take precedence. However, since `decimal_graph` borrows from `cardinal_graph`, its weight must still be less than `1.2`. 
As such: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-wF8cgLK9tpU" - }, - "outputs": [], - "source": [ - "classify = (\n", - " pynutil.add_weight(time_graph, 1)\n", - " | pynutil.add_weight(whitelist_graph, 1)\n", - " | pynutil.add_weight(decimal_graph, 1.1)\n", - " | pynutil.add_weight(cardinal_graph, 1.2)\n", - " | pynutil.add_weight(ordinal_graph, 1)\n", - " | pynutil.add_weight(money_graph, 1.09)\n", - " | pynutil.add_weight(word_graph, 100)\n", - " )\n", - "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1) + pynutil.insert(\" }\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "huMzDoZ2-FD2" - }, - "source": [ - "For those classes that don't seem affected, we can set their weights as the same as those below their 'foundation' graphs, simply to prevent prioritization when not required\n", - "\n", - "Meanwhile, `whitelist_graph` should take precedence over all others, as it may contain unique normalizations that may get accidentally caught by the other graphs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gWG6ttyd-bbD" - }, - "outputs": [], - "source": [ - "classify = (\n", - " pynutil.add_weight(time_graph, 1.1)\n", - " | pynutil.add_weight(whitelist_graph, 1.07)\n", - " | pynutil.add_weight(decimal_graph, 1.1)\n", - " | pynutil.add_weight(cardinal_graph, 1.2)\n", - " | pynutil.add_weight(ordinal_graph, 1.1)\n", - " | pynutil.add_weight(money_graph, 1.08)\n", - " | pynutil.add_weight(word_graph, 100)\n", - " )\n", - "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(\" }\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1TH08f8O-fWx" - }, - "source": [ - "Keep in mind that building weights in this manner is hardly a rule for grammar development and is instead intended as a means to initialize weights for empirical development. You will find that actual strings will cause unexpected behavior that require fine tuning. \n", - "\n", - "For instance, the Classifier for French in NeMo ITN benefits from having varying precedence for some weights, as seen in the following excerpt:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gKdkyDK3_r46" - }, - "outputs": [], - "source": [ - "class ClassifyFst(GraphFst):\n", - " \"\"\"\n", - " Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased.\n", - " For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. \n", - " More details to deployment at NeMo/tools/text_processing_deployment.\n", - "\n", - " Args:\n", - " cache_dir: path to a dir with .far grammar file. 
Set to None to avoid using cache.\n", - " overwrite_cache: set to True to overwrite .far files\n", - " \"\"\"\n", - "\n", - " def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):\n", - " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")\n", - "\n", - " far_file = None\n", - " if cache_dir is not None and cache_dir != \"None\":\n", - " os.makedirs(cache_dir, exist_ok=True)\n", - " far_file = os.path.join(cache_dir, \"_fr_itn.far\")\n", - " if not overwrite_cache and far_file and os.path.exists(far_file):\n", - " self.fst = pynini.Far(far_file, mode=\"r\")[\"tokenize_and_classify\"]\n", - " logging.info(f\"ClassifyFst.fst was restored from {far_file}.\")\n", - " else:\n", - " logging.info(f\"Creating ClassifyFst grammars.\")\n", - "\n", - " cardinal = CardinalFst()\n", - " cardinal_graph = cardinal.fst\n", - "\n", - " fraction = FractionFst(cardinal)\n", - " fraction_graph = fraction.fst\n", - "\n", - " ordinal = OrdinalFst(cardinal)\n", - " ordinal_graph = ordinal.fst\n", - "\n", - " decimal = DecimalFst(cardinal)\n", - " decimal_graph = decimal.fst\n", - "\n", - " measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction).fst\n", - " date_graph = DateFst(cardinal).fst\n", - " word_graph = WordFst().fst\n", - " time_graph = TimeFst().fst\n", - " money_graph = MoneyFst(cardinal, decimal).fst\n", - " whitelist_graph = WhiteListFst().fst\n", - " punct_graph = PunctuationFst().fst\n", - " electronic_graph = ElectronicFst().fst\n", - " telephone_graph = TelephoneFst().fst\n", - "\n", - " classify = (\n", - " pynutil.add_weight(whitelist_graph, 1.01)\n", - " | pynutil.add_weight(time_graph, 1.05)\n", - " | pynutil.add_weight(date_graph, 1.09)\n", - " | pynutil.add_weight(decimal_graph, 1.08)\n", - " | pynutil.add_weight(measure_graph, 1.1)\n", - " | pynutil.add_weight(cardinal_graph, 1.1)\n", - " | pynutil.add_weight(ordinal_graph, 1.1)\n", - " | pynutil.add_weight(fraction_graph, 1.09)\n", - " | pynutil.add_weight(money_graph, 1.07)\n", - " | pynutil.add_weight(telephone_graph, 1.1)\n", - " | pynutil.add_weight(electronic_graph, 1.1)\n", - " | pynutil.add_weight(word_graph, 100)\n", - " )\n", - "\n", - " punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(\" }\")\n", - " token = pynutil.insert(\"tokens { \") + classify + pynutil.insert(\" }\")\n", - " token_plus_punct = (\n", - " pynini.closure(punct + pynutil.insert(\" \")) + token + pynini.closure(pynutil.insert(\" \") + punct)\n", - " )\n", - "\n", - " graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)\n", - " graph = delete_space + graph + delete_space\n", - "\n", - " self.fst = graph.optimize()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qc4B_0rNcQZu" - }, - "source": [ - "## FAR import/export" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0nRRPvy-AYsA" - }, - "source": [ - "While working through these code excerpts, you may have noticed some latency with each instantiation of our WFSTs (notably wherever `CardinalFst` was involved). This is because the `pynini.optimize` that we call with each graph's instantiation is computationally expensive. 
For our ultimate purpose of deployment, it seems a waste of resources to recreate stable graphs for each use.\n", - "\n", - "To address this, NeMo ITN supports WFST caching through use of `pynini.Far`, storing and recovering Classify grammars as FAR (Fst ARchives).\n", - "\n", - "Let us update our `ClassifyFst` to permit passing a cache and allowing overwriting (for development):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5XgWevUzD1AE" - }, - "outputs": [], - "source": [ - "class ClassifyFst(GraphFst):\n", - " def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):\n", - " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l28GMR70ESz0" - }, - "source": [ - "For storing our graphs as FARs, we can use `graph_utils.generator_main`, which saves our WFSTs by type for easier management. For arguments it takes a string name and a dict mapping of WFST type to graph:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "AzTkcmAWFLYm" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "class ClassifyFst(GraphFst):\n", - " def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):\n", - " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")\n", - " # Grammar here\n", - " # ....\n", - " if cache_dir is not None and cache_dir != \"None\":\n", - " os.makedirs(cache_dir, exist_ok=True)\n", - " far_file = os.path.join(cache_dir, \"_fr_itn.far\")\n", - " generator_main(far_file, {\"tokenize_and_classify\": self.fst})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Wz8wjCQSD6eJ" - }, - "source": [ - "We pair this with the ability to load from cache (note the `\"tokenize_and_classify\"` key being passed):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FRFYgMmuD_53" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "class ClassifyFst(GraphFst):\n", - " def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):\n", - " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")\n", - " if not overwrite_cache and far_file and os.path.exists(far_file):\n", - " self.fst = pynini.Far(far_file, mode=\"r\")[\"tokenize_and_classify\"]\n", - " else:\n", - " # Grammar here\n", - " # ....\n", - " if cache_dir is not None and cache_dir != \"None\":\n", - " os.makedirs(cache_dir, exist_ok=True)\n", - " far_file = os.path.join(cache_dir, \"_fr_itn.far\")\n", - " generator_main(far_file, {\"tokenize_and_classify\": self.fst})\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ib9nggZxF38s" - }, - "source": [ - "Producing our `ClassifyFst` as:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "d2BZyx6sGGg2" - }, - "outputs": [], - "source": [ - "class ClassifyFst(GraphFst):\n", - " def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):\n", - " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")\n", - "\n", - " far_file = None\n", - " if cache_dir is not None and cache_dir != \"None\":\n", - " os.makedirs(cache_dir, exist_ok=True)\n", - " far_file = os.path.join(cache_dir, \"_fr_itn.far\")\n", - " if not overwrite_cache and far_file and os.path.exists(far_file):\n", - " self.fst = pynini.Far(far_file, mode=\"r\")[\"tokenize_and_classify\"]\n", - " else:\n", - " cardinal = CardinalFst()\n", - " cardinal_graph = 
cardinal.fst\n", - "\n", - " ordinal = OrdinalFst(cardinal)\n", - " ordinal_graph = ordinal.fst\n", - "\n", - " decimal = DecimalFst(cardinal)\n", - " decimal_graph = decimal.fst\n", - "\n", - " whitelist_graph = WhiteList().fst\n", - " word_graph = WordFst().fst\n", - " time_graph = TimeFst().fst\n", - " money_graph = MoneyFst(cardinal, decimal).fst\n", - " whitelist_graph = WhiteListFst().fst\n", - " punct_graph = PunctuationFst().fst\n", - "\n", - " classify = (\n", - " pynutil.add_weight(time_graph, 1.1)\n", - " | pynutil.add_weight(whitelist_graph, 1.01)\n", - " | pynutil.add_weight(decimal_graph, 1.09)\n", - " | pynutil.add_weight(cardinal_graph, 1.1)\n", - " | pynutil.add_weight(ordinal_graph, 1.09)\n", - " | pynutil.add_weight(money_graph, 1.08)\n", - " | pynutil.add_weight(word_graph, 100)\n", - " )\n", - "\n", - " punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(\" }\")\n", - " token = pynutil.insert(\"tokens { \") + classify + pynutil.insert(\" }\")\n", - " token_plus_punct = (\n", - " pynini.closure(punct + pynutil.insert(\" \")) + token + pynini.closure(pynutil.insert(\" \") + punct)\n", - " )\n", - "\n", - " graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)\n", - " graph = delete_space + graph + delete_space\n", - "\n", - " self.fst = graph.optimize()\n", - "\n", - " if far_file:\n", - " generator_main(far_file, {\"tokenize_and_classify\": self.fst})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nEhY6wKKtfhn" - }, - "source": [ - "You should find the caching to vastly speed up compilingtime." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rTtCnC5w95CI" - }, - "source": [ - "# Verbalize and Verbalize Final " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "H9y5yuk1HaGj" - }, - "source": [ - "Our last step is to create a universal Verbalizer for all classes. This is very similar to development of `ClassifierFst`, except that the Verbalizer breaks its normalization task into two components:\n", - "- `VerbalizeFst`, which removes formatting for each token\n", - "- `VerbalizeFinalFst`, which extends `VerbalizeFst` across all tokens in a string\n", - "Why two componenets when `tokenize_and_classify` was one? Because Sparrowhawk performs all the functionality of `VerbalizeFinalFst`, so its inclusion would break deployment. However, without it, your NeMo grammar would be unable to function at base. So we separate the two to allow the best of both world." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vUawTJVuH8iR" - }, - "source": [ - "## VerbalizeFst" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xghiBV06IIWU" - }, - "source": [ - "Much like `ClassifyFst`, `VerbalizeFst` instantiates all its subgraphs and then joins them together under a union operation. However, it does not need to employ weighting. Why? Because `ClassifyFst` has assigned each token a specific class. 
As each class is unique, there is no possibility that a subgraph will be employed for the wrong token.\n", - "\n", - "As such, our `VerbalizeFst` is formed by a simple union operation across all previous Verbalizer graphs:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uMVCqCvsIt2v" - }, - "outputs": [], - "source": [ - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.cardinal import CardinalFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.decimal import DecimalFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.money import MoneyFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.ordinal import OrdinalFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.time import TimeFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.whitelist import WhiteListFst\n", - "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.word import WordFst\n", - "\n", - "class VerbalizeFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"verbalize\", kind=\"verbalize\")\n", - " cardinal = CardinalFst()\n", - " cardinal_graph = cardinal.fst\n", - " ordinal_graph = OrdinalFst().fst\n", - " decimal = DecimalFst()\n", - " decimal_graph = decimal.fst\n", - " whitelist_graph = WhiteListFst().fst\n", - " money_graph = MoneyFst(decimal=decimal).fst\n", - " time_graph = TimeFst().fst\n", - " graph = (\n", - " time_graph\n", - " | whitelist_graph\n", - " | money_graph\n", - " | ordinal_graph\n", - " | decimal_graph\n", - " | cardinal_graph\n", - " )\n", - " self.fst = graph" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Wap-LU6EI2Iu" - }, - "source": [ - "## Verbalize Final" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TYaEt_0tI47t" - }, - "source": [ - "With `VerbalizeFst` complete, we now extend our graph to cover any series of tokens. All this requires is deletion of the `tokens` formatting (note the absence of such in our previous graph) and use of closure for any series of one or more tokens.\n", - "\n", - "This provides the following graph:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "L-9lJNE6JPCW" - }, - "outputs": [], - "source": [ - "\n", - "class VerbalizeFinalFst(GraphFst):\n", - " def __init__(self):\n", - " super().__init__(name=\"verbalize_final\", kind=\"verbalize\")\n", - " verbalize = VerbalizeFst().fst\n", - " word = WordFst().fst\n", - " types = verbalize | word\n", - " graph = (\n", - " pynutil.delete(\"tokens\")\n", - " + delete_space\n", - " + pynutil.delete(\"{\")\n", - " + delete_space\n", - " + types\n", - " + delete_space\n", - " + pynutil.delete(\"}\")\n", - " )\n", - " graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space\n", - " self.fst = graph" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WwMKFw-QJVgm" - }, - "source": [ - "Unlike `ClassifyFst`, NeMo ITN does not cache `VerbalizeFst` or `VerbalizeFinalFst`. 
(While you are welcome to provide such functionality in your own development, keep in mind that the limited complexity of our Verbalizers makes compilingtimes less significant.)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7U21AZearZMK" - }, - "source": [ - "# Deployment " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VrSccoh9K6JK" - }, - "source": [ - "Now that we have done all the groundwork, we can finally move to deployment. This final section will just cover the minor code alterations required to call your language through NeMo ITN and deploy through Sparrowhawk. For further information on using NeMo ITN, please see [this tutorial](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Inverse_Text_Normalization.ipynb). " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0Le2aJvFIAKd" - }, - "source": [ - "## InverseNormalize" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r2R3TUCDLi5-" - }, - "source": [ - "NeMo calls upon the `InverseNormalizer` class for all ITN tasks. Given a string and language, it will instantiate both the `ClassifierFst` and `VerbalizeFst` respective for the given language. (Note: we do not use `VerbalizeFinal` as its functions are managed by Sparrowhawk.) To make your language deployable in the general NeMo ITN system, you must designate the availability of these classes for instantiation. (For more information, see the [source code](https://github.com/NVIDIA/NeMo/blob/main/nemo_text_processing/inverse_text_normalization/inverse_normalize.py).)\n", - "\n", - "To do so requires only two changes. The first is providing a string to identify your language as an option for `parse_args` ([ISO codes are advised](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tfv4Ee3ML-Fg" - }, - "outputs": [], - "source": [ - "def parse_args():\n", - " parser = ArgumentParser()\n", - " parser.add_argument(\"input_string\", help=\"input string\", type=str)\n", - " parser.add_argument(\"--language\", help=\"language\", choices=['en', 'de', 'es', 'ru', 'fr', 'MY_LANGUAGE'], default=\"en\", type=str)\n", - " parser.add_argument(\"--verbose\", help=\"print info for debugging\", action='store_true')\n", - " parser.add_argument(\"--overwrite_cache\", help=\"set to True to re-create .far grammar files\", action=\"store_true\")\n", - " parser.add_argument(\n", - " \"--cache_dir\",\n", - " help=\"path to a dir with .far grammar file. 
Set to None to avoid using cache\",\n", - " default=None,\n", - " type=str,\n", - " )\n", - " return parser.parse_args()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "awVl5nAsMUTl" - }, - "source": [ - "The next is to call your `ClassifyFst` and `VerbalizeFst` from `__init__`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class InverseNormalizer(Normalizer):\n", - " def __init__(self, lang: str = 'en', cache_dir: str = None, overwrite_cache: bool = False):\n", - "\n", - " if lang == 'en':\n", - " from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import ClassifyFst\n", - " from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize_final import (\n", - " VerbalizeFinalFst,\n", - " )\n", - " # Other languages\n", - " # ....\n", - " elif lang == 'MY_LANGUAGE':\n", - "\n", - " from nemo_text_processing.inverse_text_normalization.MY_LANGUAGE.taggers.tokenize_and_classify import ClassifyFst\n", - "\n", - " from nemo_text_processing.inverse_text_normalization.MY_LANGUAGE.verbalizers.verbalize_final import (\n", - "\n", - " VerbalizeFst,\n", - "\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TI1PuejLMxdI" - }, - "source": [ - "And you're done! NeMo will handle the rest. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xrksINQoICfj" - }, - "source": [ - "## Sparrowhawk" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rP9-dmMJSg3h" - }, - "source": [ - "Sparrowhawk is an open-source implementation of Google's Kestrel Text Normalization system. Functionally it operates similar to NeMo ITN (the two-step Classify and Verbalize functions stem from [intentional NeMo integration](https://arxiv.org/pdf/2104.05055.pdf) but is better optimized for backend deployment. \n", - "\n", - "Like the preceding section, this portion of the tutorial will highlight a few necessary edits so you may deploy your normalization system." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "u1eGMGxkVZmM" - }, - "source": [ - "### Grammar Export" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "v9dr0E-uVgoT" - }, - "source": [ - "The first step in deploying your grammar is by exporting both `ClassifyFst` and `VerbalizeFst` WFST as FAR files. This is done through `pynini_export.py`, found in `NeMo/tools/text_processing_deployment`. 
To allow export of your grammar, we must make the similar edits as wed did for `inverse_normalize.py`" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qtek2bMMWbMj" - }, - "source": [ - "First append your language to the list of accepted strings in `parse_args`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5pTGX9YAWiTZ" - }, - "outputs": [], - "source": [ - "\n", - "def parse_args():\n", - " parser = ArgumentParser()\n", - " parser.add_argument(\"--output_dir\", help=\"output directory for grammars\", required=True, type=str)\n", - " parser.add_argument(\"--language\", help=\"language\", choices=[\"en\", \"de\", \"es\", \"ru\", 'fr', 'MY_LANGUAGE'], type=str, default='en')\n", - " parser.add_argument(\n", - " \"--grammars\", help=\"grammars to be exported\", choices=[\"tn_grammars\", \"itn_grammars\"], type=str, required=True\n", - " )\n", - " parser.add_argument(\n", - " \"--input_case\", help=\"input capitalization\", choices=[\"lower_cased\", \"cased\"], default=\"cased\", type=str\n", - " )\n", - " parser.add_argument(\"--overwrite_cache\", help=\"set to True to re-create .far grammar files\", action=\"store_true\")\n", - " parser.add_argument(\n", - " \"--cache_dir\",\n", - " help=\"path to a dir with .far grammar file. Set to None to avoid using cache\",\n", - " default=None,\n", - " type=str,\n", - " )\n", - " return parser.parse_args()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Fm3CTmdLWlUt" - }, - "source": [ - "And then call `ClassifyFst` and `VerbalizeFinalFst` in `main`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "LANG=\"FOO\"\n", - "\n", - "if LANG == 'en':\n", - " from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import (\n", - " ClassifyFst as ITNClassifyFst,\n", - " )\n", - " from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize import (\n", - " VerbalizeFst as ITNVerbalizeFst,\n", - " )\n", - "# Other languages\n", - "# ...\n", - "elif LANG == 'MY_LANGUAGE':\n", - " from nemo_text_processing.inverse_text_normalization.MY_LANGUAGE.taggers.tokenize_and_classify import (\n", - " ClassifyFst as ITNClassifyFst,\n", - " )\n", - " from nemo_text_processing.inverse_text_normalization.MY_LANGUAGE.verbalizers.verbalize import (\n", - " VerbalizeFst as ITNVerbalizeFst,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JFgGhCMMW3UQ" - }, - "source": [ - "### Deployment" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "V8RH0aGbW41U" - }, - "source": [ - "By default, NeMo ITN is structured to allow deployment through a Docker based backend. This involved building a container from file, exporting your grammars to the container and then deploying Sparrowhawk for processing.\n", - "\n", - "NeMo automates this entire process through `export_grammars.sh`, which will automatically compile your grammars for deployment (assuming you edited `pynini_export` appropriately) and mount them in a container for you. 
For our purposes, `export_grammar` only requires the following arguments:\n", - "- `LANGUAGE` - the string you have used throughout to indicate your language\n", - "- `GRAMMARS` - only accepts `itn_grammars`(Inverse Text Normalization) or `tn_grammars` (Text Normalization)\n", - "\n", - "For instance, we would call our French ITN with:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KYdbawAfZIco" - }, - "source": [ - "`bash export_grammar.sh --GRAMMARS=itn_grammars --LANGUAGE={LANGUAGE}`" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UXVr2twdZMO2" - }, - "source": [ - "Which will return the Docker prompt for further normalization." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TDoVUxCE-Dax" - }, - "source": [ - "# Final Notes" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Fw-9mU7ql8iY" - }, - "source": [ - "Congratulations, you have now constructed an entire ITN system from the ground up! While your experience will vary with each language, you will find several commonalities that will assist you in further development. \n", - "\n", - "If you are interested in working further with your language WFSTs, you may wish to construct a TN system. Broadly, this is accomplished by inverting your previous graphs (`pynini.invert` may assist here) and changing your outputs to avoid indeterminacy (i.e. decide on one canonical output for your grammar for each class). But outside of such grammar specific edits, you repeat many of the steps exhibited here, such as:\n", - "- Use of a two step classifier-verbalizer system\n", - "- Same semiotic classes for tagging\n", - "- Inheritance of `GraphFst`\n", - "- Minor import edits to `pynini_export` and `export_grammar`" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "WFST Tutorial.ipynb", - "provenance": [], - "toc_visible": true - }, - "interpreter": { - "hash": "fbc643a332f9d7801191710b24a8a955d342df4f32791f7fb65121dc4784751f" - }, - "kernelspec": { - "display_name": "Python 3.9.7 64-bit ('wfst_tutorial': conda)", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "nbformat": 4, - "nbformat_minor": 1 -} \ No newline at end of file + "id": "Qq1Hz6CKWdwl", + "outputId": "3d8f5bd6-f10e-431d-9039-eb88164fbb95" + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can run either this notebook locally or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Optional: Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Prerequisites:\n", + "1. 
Please make sure to read the [Text Processing Documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/intro.html) and [Text Normalization Introduction Tutorial](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Text_(Inverse)_Normalization.ipynb) **before** this notebook. This notebook is a in-depth tutorial on how to customize and develop your own text normalization or inverse text normalization grammars.\n", + "2. download NeMo source code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Install NeMo, which installs both nemo and nemo_text_processing package\n", + "BRANCH = 'r1.9.0'\n", + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nemo_text_processing]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pynini\n", + "import nemo_text_processing\n", + "from pynini.lib import pynutil" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, NEMO_DIGIT, delete_space, NEMO_SIGMA, NEMO_NOT_QUOTE, delete_extra_space, NEMO_NON_BREAKING_SPACE\n", + "from nemo_text_processing.text_normalization.normalize import Normalizer\n", + "\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.cardinal import CardinalFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.decimal import DecimalFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.money import MoneyFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.ordinal import OrdinalFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.punctuation import PunctuationFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.time import TimeFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.whitelist import WhiteListFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.word import WordFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.cardinal import CardinalFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.decimal import DecimalFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.money import MoneyFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.ordinal import OrdinalFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.time import TimeFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.whitelist import WhiteListFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.word import WordFst\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T0JxcvuPHvn9" + }, + "source": [ + "NeMo's Text Processing module uses Weighted Finite State Transducers (WFST) to deploy grammars for both efficient text normalization (TN) and inverse text normalization (ITN). In this tutorial, you will learn to build a normalization grammar from the ground up to use in your own text processing tasks. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Table of Contents\n", + "- WFSTs\n", + "- NeMo Inverse Text Processing\n", + "- Getting Started\n", + "- Cardinal WFST\n", + "- Ordinal WFST\n", + "- Decimal WFST\n", + "- Money WFST\n", + "- Time WFST\n", + "- WhiteList WFST\n", + "- Word and Punctuation WFST\n", + "- Other Classes\n", + "- Tokenize and Classify\n", + "- Verbalize and Verbalize Final\n", + "- Deployment" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lMUovcMsfXyI" + }, + "source": [ + "# WFSTs " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y1ejNMLbH1jM" + }, + "source": [ + "WFSTs are a form of [Finite State Machines](https://en.wikipedia.org/wiki/Finite-state_machine) used to graph relations between regular languages (or [regular expressions](https://en.wikipedia.org/wiki/Regular_expression)). For our purposes, they can be defined by two major properties:\n", + "\n", + "1. Mappings between accepted input and output expressions for text substitution\n", + "2. Path weighting to direct graph traversal" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nNg45ZuaP_A8" + }, + "source": [ + "For example, consider a simple normalization task of mapping the word \"cent\" (French for \"one hundred\") to the numerical representation `100`. We would begin with a Finite State representation of the regex `/cent/`:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uxo7gUkW_XKT" + }, + "source": [ + "![cent.png](images/cent.PNG)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fahsjMVFlbCa" + }, + "source": [ + "And then create a mapping to the text string `100`:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IMJ-fNSk_jXC" + }, + "source": [ + "![cent_to_100.png](images/cent_to_100.PNG)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bPKW0I4yAGUb" + }, + "source": [ + "*Note: Null characters are expressed as `ε` by convention*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_0NK3aW5nG9C" + }, + "source": [ + "This would give us a WFST with universal path weights. (By default, `pynini` uses [tropical semirings](https://en.wikipedia.org/wiki/Tropical_semiring) for arcs, giving each arc a default weight of `0`.)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CzBc9D3qTGJ-" + }, + "source": [ + "Now, let us consider expanding our model. To indicate values between `100` and `200`, French uses the number scheme of `cent + digit`. For example, `120` would be pronounced as \"cent-vingt\". To create the appropriate output string, we would now want to map \"cent\" to `1` and the remaining aspect of our string to the appropriate digit representation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GRrKNQRjFDoL" + }, + "source": [ + "![cent_vingt_to_120.png](images/cent_vingt_to_120.PNG)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jLpm4mufAfUz" + }, + "source": [ + "However this would make our graph [non-deterministic](https://en.wikipedia.org/wiki/Nondeterministic_algorithm) - it will have multiple possibilities for termination. Now an input of \"cent-vingt\" could have the outcome of `100` or `10020` when only one is correct. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![cent_vingt_bad.png](images/cent_vingt_bad.PNG)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c-GJTpgIAf7S" + }, + "source": [ + "To correct this, we may add a new end state and a weight to the path that accepts the input without `s`:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6GJcsdttGg_S" + }, + "source": [ + "![cent_vingt_good.png](images/cent_vingt_good.PNG)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mHft1gzsAipc" + }, + "source": [ + "Now, we can guarantee an ideal mapping by relying on a shortest-path (smallest-weight) heuristic: traversal of the graph will prioritize longer inputs, only converting \"cent\" to `100` when a larger input isn't available. As such, we've now removed the undesired output `10020` while preserving our desired coverage in string mapping. \n", + "\n", + "This use of weights to ensure predictable behavior allows WFSTs to exploit the efficiency of standard graph traversal algorithms while also maintaining versatility. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8Ik4PBXafSSB" + }, + "source": [ + "# NeMo Inverse Text Processing " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b2fcWKhqYVF5" + }, + "source": [ + "Following [Google's Kestrel](https://www.researchgate.net/publication/277932107_The_Kestrel_TTS_text_normalization_system) framework, NeMo deploys two composite WFSTs for text normalization. They are as follows:\n", + "1. A *classifier* (or tagger) to label potential tokens by 'semiotic class' (e.g. currency, ordinal number, street address)\n", + "2. A *verbalizer* to render a tagged token in conventional written form\n", + "\n", + "For example, consider the sentence: <>\n", + "\n", + "For an ITN task, a tokenizer would identify the following tokens:\n", + "\n", + "`[\"le\" ,\"premier\", \"juillet\", \"il\", \"a\", \"mangé\", \"trente-cinq\", \"pommes\"]`\n", + "\n", + "and provide each a class token: \n", + "\n", + "- `tokens { name: \"le\" }`\n", + "- `tokens { date { day: \"1\" month: \"juillet\" } } ` \n", + "- `tokens { name: \"il\" }` \n", + "- `tokens { name: \"a\" }` \n", + "- `tokens { name: \"mangé\" }`\n", + "- `tokens { cardinal { integer: \"35\" } }` \n", + "- `tokens { name: \"pommes\" }`\n", + "\n", + "These tokens are then passed to a 'verbalizer' WFST, which renders each token in a conventional written form:\n", + "\n", + "- `tokens { name: \"le\" }` -> `le` \n", + "- `tokens { date { day: \"1\" month: \"juillet\" } } ` -> `1ᵉʳ` \n", + "- `tokens { name: \"il\" }` -> `juillet`\n", + "- `tokens { name: \"il\" }` -> `il` \n", + "- `tokens { name: \"a\" }` -> `a`\n", + "- `tokens { name: \"mangé\" }` -> `mangé` \n", + "- `tokens { cardinal { integer: \"35\" } }` -> `35` \n", + "- `tokens { name: \"pommes\" }` -> `pommes`\n", + "\n", + "and merged into a normalized string:\n", + "\n", + "`le 1ᵉʳ juillet il a mangé 35 pommes`\n", + "\n", + "With the equivalent TN task being the reverse process. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_n-5JExAbvwr" + }, + "source": [ + ">**_Note:_**\n", + ">A few things to note specific to inverse text normalization: \n", + ">- Each class token has a unique set of field names that must be parsed by the classifier. The default field names for NeMo are chosen to mirror the syntax in [Sparrowhawk](https://github.com/google/sparrowhawk) to enable deployment. 
If these fields are not exact, you will not be able to use Sparrowhawk.\n", + ">- NeMo assumes no punctuation (unless explicitly provided in the grammar) and all lower casing to ease integration with upstream ASR.\n", + ">- The `name` class token is default for any token that does not require processing. It will be left 'as is.'\n", + ">- You may note how the tokenizer performed the conversion of `premier` to `1` while the verbalizer normalized `1` -> `1ᵉʳ`. Such decisions are implementation dependent and will vary depending on preference and language. (That is, normalization from `premier` -> `1ᵉʳ` could have been a tokenization step.)\n", + ">- By default, NeMo will create several permutations of key values in a token to ease normalization. That is, given the token `tokens { date { day: \"1\" month: \"juillet\" } }`, it will also produce paths for `tokens { date { month: \"juillet\" day: \"1\" } }`. To prevent this and avoid ambiguity in verbalizer input, tokens can be assigned a `preserve_order` attribute to prevent permutation. (e.g. `tokens { date { day: \"1\" month: \"juillet\" preserve_order: true } }`) (We will discuss this [later in the tutorial](#verbalizer).)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## WFST Classes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeMo Text Processing's base languages supports a range of semiotic classes to permit integration with Sparrowhawk.\n", + "For this tutorial, we will be focusing on the following classes:\n", + "- CARDINAL\n", + "- ORDINAL\n", + "- DECIMAL\n", + "- MONEY\n", + "- TIME\n", + "- WHITELIST\n", + "- WORD\n", + "- PUNCTUATION\n", + "\n", + "While not comprehensive, these classes will provide enough foundation and exposure to edge cases that you will feel comfortable constructing for other cases.\n", + "\n", + "**NOTE**: *If you intend to only develop for personal use with NeMo, you may rename these classes as desired. However, Sparrowhawk integration\n", + "REQUIRES use of only these tags and their assigned attributes. For list of Sparrowhawk tokens and attributes, [consult the Sparrowhawk repository](https://github.com/yzhang123/sparrowhawk/blob/test/src/proto/semiotic_classes.proto)*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Further Reading" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you wish to learn more about NeMo Text Processing, you may wish to consult the following:\n", + "- [Y. Zhang, E. Bakhturina, K. Gorman, and B. Ginsburg, \"NeMo Inverse Text Normalization: From Development To Production\"](https://arxiv.org/pdf/2104.05055.pdf)\n", + "- [NeMo's Text Normalization Documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/intro.html) \n", + "- [NeMo's Text Normalization Deployment Documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/wfst_text_processing_deployment.html)\n", + "- NeMo's [Text Normalization Introduction Tutorial](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Text_Normalization.ipynb)\n", + "- [Sparrowhawk Documentation](https://github.com/google/sparrowhawk)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For further information regarding WFSTs, please see:\n", + "- [D. Jufasky and J. Martin, *Natural Language Processing*, Ch. 2](https://web.stanford.edu/~jurafsky/slp3/2.pdf)\n", + "- [K. 
Gorman and R. Sproat, *Finite-State Text Processing*](http://www.morganclaypoolpublishers.com/catalog_Orig/product_info.php?products_id=1636)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XFdXRcnUfI25" + }, + "source": [ + "# Getting Started \n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "K3Zl3VwqdYqL" + }, + "source": [ + "To begin tokenizer development, make sure you have [installed NeMo from source](https://github.com/NVIDIA/NeMo)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rGg7Bf13FXgc" + }, + "source": [ + "For this tutorial, we will focus on developing an Inverse Text Normalization system, such as one you may encounter in downstream ASR processing. As such, we will navigate to\n", + "`nemo_text_processing/inverse_text_normalization` and create a directory for our target language (French) and subdirectories\n", + "for `taggers` and `verbalizers`. You may also wish to create a `data` subdirectory to ease navigation.\n", + "\n", + "(Note, for text normalization, the suggested directory structure would be the same within the `nemo_text_processing/text_normalization` folder. In fact, many of NeMo's grammars actively share between.)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T58E4pU4FN3A" + }, + "source": [ + "```bash\n", + "git clone https://github.com/NVIDIA/NeMo\n", + "cd NeMo && ./reinstall.sh\n", + "cd nemo_text_processing/inverse_text_normalization/\n", + "export LANGUAGE=fr # Change this to your desired language\n", + "mkdir $LANGUAGE\n", + "mkdir $LANGUAGE/taggers\n", + "mkdir $LANGUAGE/verbalizers\n", + "mkdir $LANGUAGE/data\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "O1vfz-bUFpwz" + }, + "source": [ + "All WFSTs deployed in NeMo inherit from the `GraphFst` class.\n", + "While in most cases you can simply import from a pre-existing `graph_utils.py`, you may occasionally find it helpful for deployment to keep a copy \n", + "in your working directory for language specific edits. (For our purposes, we will be utilizing `nemo_text_processing.text_normalization.en.graph_utils`, which serves as default for NeMo's grammars.)\n", + "\n", + "You may also wish to keep a copy of `utils.py` (found in each language system's directory)\n", + "in your working directory to assist with pathing. (Make sure to adjust the imports towards your language.)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3OME84EmOQ4h", + "outputId": "6eea17f9-aae9-4176-ae35-3d1f0e94b4ea" + }, + "source": [ + "```bash\n", + "cp ../text_normalization/en/graph_utils.py $LANGUAGE/\n", + "cp ../text_normalization/en/utils.py $LANGUAGE/\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dependencies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For development, we utilize `nemo_text_processing` and `pynini` (a Python library for efficient WFST construction and traversal, installed with `nemo` by default). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "While this tutorial will attempt to make use of `pynini` tools transparent, it does assume some familiarity with its syntax. For a more in-depth guide, the following will provide a function overview:\n", + "\n", + "- [K. 
Gorman, Pynini: A Python library for weighted finite-state grammar compilation](https://aclanthology.org/W16-2409.pdf)\n", + "- [Pynini Documentation](https://www.openfst.org/twiki/bin/view/GRM/PyniniDocs) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will also import `pynini` itself along with its `pynutil` module for access to some extra functionality, and write a simple helper function for printing `pynini` graphs through the previously discussed 'shortest-path' heuristic." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sz18Ui8-8Kf4" + }, + "outputs": [], + "source": [ + "import pynini\n", + "from pynini.lib import pynutil\n", + "\n", + "def apply_fst(text, fst):\n", + " \"\"\" Given a string input, returns the output string\n", + " produced by traversing the path with lowest weight.\n", + " If no valid path accepts input string, returns an\n", + " error.\n", + " \"\"\"\n", + " try:\n", + " print(pynini.shortestpath(text @ fst).string())\n", + " except pynini.FstOpError:\n", + " print(f\"Error: No valid output with given input: '{text}'\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cardinal WFST " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rOyLZb9DgLoh" + }, + "source": [ + "The vast majority of ITN tasks require the ability to recognize and denormalize numbers. As such, we will begin by developing a Classifier and Verbalizer for Cardinal (integer) numbers. (e.g. `-3,-2,-1,0,1,2,3,4,5....99,100,101...`)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9GZQkH1V89kh" + }, + "source": [ + "## Grammar" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will begin by first constructing a Cardinal WFST, using French as an example language. While your target language will obviously differ greatly from our example, you will likely find several similarities, such as:\n", + "- Use of a (semi) regular decimal (base-10) counting system. (A common - but not universal - feature of natural languages.)\n", + "- Incorporation of several irregularities requiring contingencies in our WFST construction. (e.g. a pseudo-vigesimal (base-20) series.)\n", + "- Use of gender and number agreement in enumeration." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Digits" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NzJ2DIwc_TT3" + }, + "source": [ + "We shall begin with the first decimal place. 
As these numbers serve as the building blocks for the rest of our WFST, we shall begin by explicitly calling their WFST mappings with `pynini.string_map`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "u0H4qg4BjYfB" + }, + "outputs": [], + "source": [ + "zero = pynini.string_map([(\"zéro\",\"0\")]) # French only pronounces zeroes as stand alone\n", + "digits = pynini.string_map([ # pynini function that creates explicit input-output mappings for a WFST\n", + "\t\t\t\t(\"un\",\"1\"),\n", + "\t\t\t\t(\"une\",\"1\"),\n", + "\t\t\t\t(\"deux\",\"2\"),\n", + "\t\t\t\t(\"trois\",\"3\"),\n", + "\t\t\t\t(\"quatre\",\"4\"),\n", + "\t\t\t\t(\"cinq\",\"5\"),\n", + "\t\t\t\t(\"six\",\"6\"),\n", + "\t\t\t\t(\"sept\",\"7\"),\n", + "\t\t\t\t(\"huit\",\"8\"),\n", + "\t\t\t\t(\"neuf\",\"9\")\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0nHjY-NNjdWQ" + }, + "source": [ + "We may also simply write a `tsv` file in a separate data folder " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- zéro\t0\n", + "- un\t1\n", + "- une\t1\n", + "- deux\t2\n", + "- trois\t3\n", + "- quatre\t4\n", + "- cinq\t5\n", + "- six\t6\n", + "- sept\t7\n", + "- huit\t8\n", + "- neuf\t9" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xicKcZLEzQTg" + }, + "source": [ + "and import with `string_file`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`digits = pynini.string_file(\"data/digits.tsv\")`\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If utils.py is in working directory you may also use `get_abs_path`, which will always call paths relative to your {LANGUAGE} directory:\n", + "\n", + "`from nemo_text_processing.inverse_normalization.{LANGUAGE}.utils import get_abs_path`\n", + "\n", + "`digits = pynini.string_file(get_abs_path(\"data/digits.tsv\"))`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yPccmicQkYAB" + }, + "source": [ + "While we will use `string_map` throughout this tutorial, please note that NeMo employs the later option for maintainability and recommends its use instead." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Teens" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FQJiJcVMrNmC" + }, + "source": [ + "Let us consider our next set of numbers:\n", + "- 10 - dix\n", + "- 11 - onze\n", + "- 12 - douze\n", + "- 13 - treize\n", + "- 14 - quatorze\n", + "- 15 - quinze\n", + "- 16 - seize\n", + "- 17 - dix-sept\n", + "- 18 - dix-huit\n", + "- 19 - dix-neuf\n", + "\n", + "Like before, we can simply use `string_map` to compose a WFST for them. But note how there is some redundancy in the number set: `17`, `18`, and `19` are all of the form `dix + digit`. It would be more efficient simply to reuse our prior WFST in these cases than simply creating new arcs, states, and weights. \n", + "\n", + "We can achieve this using pynini's string concatenation function to extend the accepted input strings. First we will create an WFST for `11-16`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "orSgBwyXsfY5" + }, + "outputs": [], + "source": [ + "teens = pynini.string_map([\n", + "\t\t\t\t(\"onze\",\"11\"),\n", + "\t\t\t\t(\"douze\",\"12\"),\n", + "\t\t\t\t(\"treize\",\"13\"),\n", + "\t\t\t\t(\"quatorze\",\"14\"),\n", + "\t\t\t\t(\"quinze\",\"15\"),\n", + "\t\t\t\t(\"seize\",\"16\"),\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s1yIgigdtriQ" + }, + "source": [ + "Now, we will create a `tens` WFST that is responsible for mapping all instances of \"dix\" and concatenate (accomplished with the overloaded `+` operator) with the prior `digits` WFST. (Deleting any possible hyphens in-between with a build in `delete_hyphen`.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CzwZrFCkt87W" + }, + "outputs": [], + "source": [ + "tens = pynini.string_map([(\"dix\", \"1\")])\n", + "delete_hyphen = pynini.closure(pynutil.delete(\"-\"), 0, 1) # Applies a closure from 0-1 of operation. Equivalent to regex /?/\n", + "\n", + "graph_tens = tens + delete_hyphen + digits" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2knCwybmuTDn" + }, + "source": [ + "We now can combine the `teens` and `graph_tens` WFST together through the union operation (done with the overloaded `|` operator), allowing our choice of either graph." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WIRJ4PE7uRrl" + }, + "outputs": [], + "source": [ + "graph_tens_and_teens = graph_tens | teens" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TGkzKoeuxbeA" + }, + "source": [ + "Let's see if it works through the string function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "v2iD0_HnxdUV", + "outputId": "1d8f434f-ff8a-4c85-b8d0-1127e4587ddf" + }, + "outputs": [], + "source": [ + "apply_fst(\"dix-huit\", graph_tens_and_teens)\n", + "apply_fst(\"seize\", graph_tens_and_teens)\n", + "apply_fst(\"dix\", graph_tens_and_teens)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Yh2f-3rux8_2" + }, + "source": [ + "The first two worked, but why did we get an error with \"dix\"? If you look back, you'll notice that while our graph has a mapping from \"dix\" to `1` - the concatenation with `digits` makes the assumption that some input from those strings will follow. That is, we left no opportunity for an *omission* of `digits`.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OM_eJYlV1UVp" + }, + "source": [ + "![dix_to_digits.png](images/dix_to_digits.PNG)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "M4xCMKRA1Wzw" + }, + "source": [ + "You may also note that this issue would hold also if we wanted to normalize only digits - our graph would error out since it's expecting a `tens` or input first. \n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XJHnlJCm1dPv" + }, + "source": [ + "We can fix both of these problems by allowing an option to simply insert a zero without any extra input. 
(Much like our \"cent\" example.)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9_vvJ9Bl1dYQ" + }, + "source": [ + "![dix_to_digits_with_insert.png](images/dix_to_digits_with_insert.PNG)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hJq3uoMN2OcC" + }, + "source": [ + "This may be accomplished through use of the `pynutil.insert` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7h9xuNfA081P" + }, + "outputs": [], + "source": [ + "graph_digits = digits | pynutil.insert(\"0\") # inserts zero if no digit follows" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fA_L_6Ky2SHm" + }, + "source": [ + "And for `graph_tens`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jelVA81o2RXu" + }, + "outputs": [], + "source": [ + "tens = tens | pynutil.insert(\"0\") | tens + delete_hyphen\n", + "graph_tens = tens + graph_digits" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Gb5uhpGr3I4X" + }, + "source": [ + "Bringing everything together:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bLkDddkA3Stu" + }, + "outputs": [], + "source": [ + "graph_teens_and_tens = graph_tens | teens\n", + "graph_all = graph_teens_and_tens | zero " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DESDKScv3r3P" + }, + "source": [ + "Let us now check our tests:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7wrDNXuD3oh9", + "outputId": "661d2526-5aa0-4640-9285-bca15cd56c75" + }, + "outputs": [], + "source": [ + "apply_fst(\"dix-huit\", graph_all) \n", + "apply_fst(\"seize\" , graph_all)\n", + "apply_fst(\"dix\" , graph_all) \n", + "apply_fst(\"une\" , graph_all) \n", + "apply_fst(\"trois\" , graph_all) \n", + "apply_fst(\"quatre\" , graph_all) \n", + "apply_fst(\"zéro\" , graph_all)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Tz_k3NoB66Bv" + }, + "source": [ + "Now we have no more error - albeit at the cost of leading zeroes. (We will take care of this later in the section.)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tens" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2dJZAhE57an3" + }, + "source": [ + "Now that we've taken care of the teens, we can proceed with the rest of the tens. Like many languages, French employs a (fairly) regular schema of: `tens_digit + ones_digit` for 20-100. Indeed, we can summarize 20-69 in the following template:\n", + "\n", + "- 20 - vingt\n", + "- 21 - vingt-et-un\n", + "- 22 - vingt-deux\n", + "- 23 - vingt-trois\n", + "- 24 - vingt-quatre\n", + "- 25 - vingt-cinq\n", + "- 26 - vingt-six\n", + "- 27 - vingt-sept\n", + "- 28 - vingt-huit\n", + "- 29 - vingt-neuf\n", + "- 30 - trente\n", + "- 31 - trente-et-un\n", + "- 32 - trente-deux\n", + "- 33 - trente-trois\n", + "...\n", + "- 40 - quarante\n", + "...\n", + "- 50 - cinquante\n", + "...\n", + "- 60 - soixante\n", + "..." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BuaxVG35UKcs" + }, + "source": [ + "Expanding `tens` is fairly easy to accommodate this template: we simply extend our earlier `string_map` for the new terms in the 'tens place.' 
From there, we once again concatenate the `digits` WFST (along with a simple WFST to delete the occurrence of the \"-et-\" term that occasionally occurs.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qAnXlRkR32wt" + }, + "outputs": [], + "source": [ + "tens = pynini.string_map([\n", + "\t\t\t\t(\"dix\", \"1\"),\n", + "\t\t\t\t(\"vingt\",\"2\"),\n", + "\t\t\t\t(\"trente\",\"3\"),\n", + "\t\t\t\t(\"quarante\",\"4\"),\n", + "\t\t\t\t(\"cinquante\",\"5\"),\n", + "\t\t\t\t(\"soixante\",\"6\"),\n", + "\t\t])\n", + "\n", + "graph_et = pynutil.delete(\"-et-\")\n", + "\n", + "tens = tens | pynutil.insert(\"0\") | tens + pynutil.delete(\"-\") | tens + graph_et\n", + "\n", + "graph_tens = tens + graph_digits\n", + "graph_teens_and_tens = graph_tens | teens\n", + "graph_all = graph_teens_and_tens | zero " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-hJwqPDx8I2R" + }, + "source": [ + "#### Special Cases: 70-99" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zvBLvJdY9XPA" + }, + "source": [ + "However, things get tricky once we go beyond the 60s. Here, standard French possesses a notorious psuedo-vigecimal (base-20) system. For numbers 70-99:\n", + "\n", + "- 70 - soixante-dix <- Literally in English: \"sixty-ten\"\n", + "- 71 - soixante-et-onze <- Literally in English: \"sixty-and-eleven\"\n", + "- 72 - soixante-douze\n", + "- 73 - soixante-treize\n", + "- 74 - soixante-quatorze\n", + "- 75 - soixante-quinze\n", + "- 76 - soixante-seize\n", + "- 77 - soixante-dix-sept\n", + "- 78 - soixante-dix-huit\n", + "- 79 - soixante-dix-neuf\n", + "- 80 - quatre-vingts <- Literally in English: \"four-twenties\"\n", + "- 81 - quatre-vingt-un\n", + "- 82 - quatre-vingt-deux\n", + "- 83 - quatre-vingt-trois\n", + "- 84 - quatre-vingt-quatre\n", + "- 85 - quatre-vingt-cinq\n", + "- 86 - quatre-vingt-six\n", + "- 87 - quatre-vingt-sept\n", + "- 88 - quatre-vingt-huit\n", + "- 89 - quatre-vingt-nuef\n", + "- 90 - quatre-vingt-dix <- Literally in English: \"four-twenties-ten\"\n", + "- 91 - quatre-vingt-onze\n", + "- 92 - quatre-vingt-douze\n", + "- 93 - quatre-vingt-treize\n", + "- 94 - quatre-vingt-quatorze\n", + "- 95 - quatre-vingt-quinze\n", + "- 96 - quatre-vingt-seize\n", + "- 97 - quatre-vingt-dix-sept\n", + "- 98 - quatre-vingt-dix-huit\n", + "- 99 - quatre-vingt-dix-neuf" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HQNiwFDyVV_3" + }, + "source": [ + "As before, we want to take advantage of as much redundancy as we can without creating additional ambiguities that will impede graph traversal. \n", + "\n", + "We first note that - despite repeating prior words - \"quatre-vingt\" can be mapped to `8` without introducing ambiguity. This is because, despite \"quatre\" and \"vingt\" being present in our prior graphs, our WFST has no pathing for them in this exact order. As such, we can simply add it to `tens` and immediately improve our coverage for 81-89. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AvJqaHhE9Wbd" + }, + "outputs": [], + "source": [ + "tens = pynini.string_map([\n", + "\t\t\t\t(\"dix\", \"1\"),\n", + "\t\t\t\t(\"vingt\",\"2\"),\n", + "\t\t\t\t(\"trente\",\"3\"),\n", + "\t\t\t\t(\"quarante\",\"4\"),\n", + "\t\t\t\t(\"cinquante\",\"5\"),\n", + "\t\t\t\t(\"soixante\",\"6\"),\n", + " (\"quatre-vingt\", \"8\")\n", + "\t\t])\n", + "tens = tens | pynutil.insert(\"0\") | tens + delete_hyphen | tens + graph_et\n", + "graph_tens = tens + graph_digits\n", + "graph_teens_and_tens = graph_tens | teens\n", + "graph_all = graph_teens_and_tens | zero " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0_DtcpZxZTzX" + }, + "source": [ + "Of course, now we permit the occurrence of:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "V2leANlDhCvj", + "outputId": "db8d5d02-c848-4e50-df23-d8499538281c" + }, + "outputs": [], + "source": [ + "apply_fst(\"quatre-vingt\", graph_all)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_X_ef3sihCHH" + }, + "source": [ + "which is invalid (French uses the plural \"quatre-vingt**s**\" here.) " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vgKT903Y6rIQ" + }, + "source": [ + "Should we alter the grammar because of this? Such a decision will largely be dependent on your intended implementation and design aims. If you see the question of 'legal' tokens as a responsibility of your upstream model, then there is no need for any alteration: \"quatre-vingt\" as a standalone token will simply not occur, so there is no input to be concerned with.\n", + "\n", + "However, if your ITN grammars are developed for an environment with low-fidelity ASR and/or where mistaken transcriptions incur heavy loss (e.g. ASR for driving directions, telephone-numbers, banking) then you may wish to err on the side of caution." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Hf_FghLT7jdY" + }, + "source": [ + "If we wanted to go for the latter, we would want to mark that \"quatre-vingts\" maps **only** to `80`. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JliFTF3mZSsJ" + }, + "outputs": [], + "source": [ + "quatre_vingt_plural = pynini.string_map([\n", + " (\"quatre-vingts\", \"80\")\n", + "\t\t])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "81_b3XPbicT1" + }, + "source": [ + "And that \"quatre vingt\" can only accompany non-zero digits:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "E4_dmg6uin2j" + }, + "outputs": [], + "source": [ + "quatre_vingt_singular = pynini.string_map([\n", + " (\"quatre-vingt-\", \"8\") # Note that the hyphen can be assumed now\n", + "\t\t])\n", + "graph_digits_without_zero = pynini.string_map([\n", + "\t\t\t\t(\"un\",\"1\"),\n", + "\t\t\t\t(\"une\",\"1\"),\n", + "\t\t\t\t(\"deux\",\"2\"),\n", + "\t\t\t\t(\"trois\",\"3\"),\n", + "\t\t\t\t(\"quatre\",\"4\"),\n", + "\t\t\t\t(\"cinq\",\"5\"),\n", + "\t\t\t\t(\"six\",\"6\"),\n", + "\t\t\t\t(\"sept\",\"7\"),\n", + "\t\t\t\t(\"huit\",\"8\"),\n", + "\t\t\t\t(\"neuf\",\"9\")\n", + "])\n", + "graph_eighties = (quatre_vingt_singular + graph_digits_without_zero) | quatre_vingt_plural" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mL7jpekV8VgP" + }, + "source": [ + "For the `70`'s and `90`'s, we would likewise need to form exclusive configurations for their number series, rewriting digits to recognize \"onze\", \"douze\", \"treize\"... as `1,2,3....` (Note, we'll have to separate `71` and `91` to manage \"soixante-**et**-onze\" vs. \"quatre-vingt-onze\".)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y3dYkwK29zCX" + }, + "outputs": [], + "source": [ + "seventy_and_ninety = pynini.string_map([\n", + " (\"soixante-dix\", \"70\"),\n", + " (\"quatre-vingt-dix\", \"90\"),\n", + "\t\t])\n", + "\n", + "seventy_and_ninety_tens = pynini.string_map([\n", + " (\"soixante-\", \"7\"),\n", + " (\"quatre-vingt-\", \"9\"),\n", + "\t\t])\n", + "\n", + "seventy_and_ninety_one = pynini.string_map([\n", + " (\"soixante-et-onze\", \"71\"),\n", + " (\"quatre-vingt-onze\", \"91\"),\n", + "\t\t])\n", + "\n", + "seventy_and_ninety_digits = digits = pynini.string_map([ \n", + "\t\t\t\t(\"douze\",\"2\"),\n", + "\t\t\t\t(\"treize\",\"3\"),\n", + "\t\t\t\t(\"quatorze\",\"4\"),\n", + "\t\t\t\t(\"quinze\",\"5\"),\n", + "\t\t\t\t(\"seize\",\"6\"),\n", + "\t\t\t\t(\"dix-sept\",\"7\"), # For 97-99, digits are used as normal.\n", + "\t\t\t\t(\"dix-huit\",\"8\"),\n", + "\t\t\t\t(\"dix-neuf\",\"9\")\n", + "])\n", + "\n", + "graph_seventies_and_nineties = (seventy_and_ninety_tens + seventy_and_ninety_digits) | seventy_and_ninety | seventy_and_ninety_one " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4NCrCwEH9HVg" + }, + "source": [ + "Now we union them with our original `tens` series:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "psGCgxaH-btn" + }, + "outputs": [], + "source": [ + "tens = pynini.string_map([\n", + "\t\t\t\t(\"dix\", \"1\"),\n", + "\t\t\t\t(\"vingt\",\"2\"),\n", + "\t\t\t\t(\"trente\",\"3\"),\n", + "\t\t\t\t(\"quarante\",\"4\"),\n", + "\t\t\t\t(\"cinquante\",\"5\"),\n", + "\t\t\t\t(\"soixante\",\"6\"),\n", + "\t\t])\n", + "tens = tens | pynutil.insert(\"0\") | tens + delete_hyphen | tens + graph_et\n", + "\n", + "graph_tens = tens + graph_digits\n", + "graph_tens_with_special_cases = graph_tens | graph_seventies_and_nineties | graph_eighties\n", + "graph_teens_and_tens = graph_tens_with_special_cases | teens\n", + "graph_all 
= graph_teens_and_tens | zero " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xWjSAGRX_s0H" + }, + "source": [ + "Making sure test cases work:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kapWmgos-xcn", + "outputId": "5e9c6f5c-1450-495f-cadf-2945355b651c" + }, + "outputs": [], + "source": [ + "apply_fst(\"quatre-vingt-treize\" , graph_all)\n", + "apply_fst(\"quatre-vingts\", graph_all)\n", + "apply_fst(\"quatre-vingt-deux\", graph_all)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hNUepfKZ_vS_" + }, + "source": [ + "And the other cases fail as expected:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wo2pCOXGAgYn", + "outputId": "0bbe2792-8bc9-40f7-dd28-4745bd1390e3" + }, + "outputs": [], + "source": [ + "apply_fst(\"quatre-vingt\", graph_all)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4VPuCTTtigh-" + }, + "source": [ + "Of course, there are other ways we could have reconfigured the grammar: we could simply make specific graphs for multiples of ten (`10,20,30..`) and all cases where \"-et-\" occurs (`21,31,41,51...91`). \n", + "\n", + "But this ignores a more important question: was any of this necessary in the first place? All these extra grammars did was simply expand coverage for thirty additional cardinals. And they still didn't exclude all faulty inputs! Note the following cases:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KICvpeewCFyH", + "outputId": "174dd910-7329-4a5f-a5b0-5e796a174217" + }, + "outputs": [], + "source": [ + "apply_fst(\"dix-une\", graph_all) # supposed to be \"onze\"\n", + "apply_fst(\"dix-deux\", graph_all) # supposed to be \"douze\"\n", + "apply_fst(\"vingt-un\", graph_all) # supposed to be \"vingt-et-un\"\n", + "apply_fst(\"trente-un\", graph_all) # supposed to be \"trente-et-un\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0D130jIVCLp2" + }, + "source": [ + "We *still* need to address possible edge cases!\n", + "\n", + "All of this is to say that knowing your input domain before construction is imperative, as small decisions can easily determine your output range later down the line.\n", + "\n", + "Indeed, if you're particularly concerned with limiting input possibilities, it may be valid simply to write all unique options within a `string_map`. While a tad inelegant, it certainly assists in controlling your outputs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RSp9w5ayA9ii" + }, + "outputs": [], + "source": [ + "graph_tens_special = pynini.string_map([\n", + "\t\t\t\t(\"soixante-dix\", \"70\"),\n", + "\t\t\t\t(\"soixante-et-onze\",\"71\"),\n", + " (\"soixante-douze\",\"72\"),\n", + "\t\t\t\t(\"soixante-treize\",\"73\"),\n", + "\t\t\t\t(\"soizante-quatorze\",\"74\"),\n", + "\t\t\t\t(\"soixante-quinze\",\"75\"),\n", + "\t\t\t\t(\"soixante-seize\",\"76\"),\n", + " (\"soixante-dix-sept\",\"77\"),\n", + " (\"soixante-dix-huit\",\"78\"),\n", + "\t\t\t\t(\"soixante-dix-neuf\",\"79\"),\n", + " (\"quatre-vingts\", \"80\"),\n", + " (\"quatre-vingt-un\", \"81\"),\n", + " (\"quatre-vingt-une\", \"81\"),\n", + "\t\t\t\t(\"quatre-vingt-deux\",\"82\"),\n", + " (\"quatre-vingt-trois\",\"83\"),\n", + " (\"quatre-vingt-quatre\",\"84\"),\n", + " (\"quatre-vingt-cinq\",\"85\"),\n", + " (\"quatre-vingt-six\",\"86\"),\n", + " (\"quatre-vingt-sept\",\"87\"),\n", + " (\"quatre-vingt-huit\",\"88\"),\n", + " (\"quatre-vingt-neuf\",\"89\"),\n", + " (\"quatre-vingt-dix\",\"90\"),\n", + " (\"quatre-vingt-onze\",\"91\"),\n", + " (\"quatre-vingt-douze\",\"92\"),\n", + " (\"quatre-vingt-treize\",\"93\"),\n", + " (\"quatre-vingt-quatorze\",\"94\"),\n", + " (\"quatre-vingt-quinze\",\"95\"),\n", + " (\"quatre-vingt-sieze\",\"96\"),\n", + " (\"quatre-vingt-dix-sept\",\"97\"),\n", + " (\"quatre-vingt-dix-huit\",\"98\"),\n", + " (\"quatre-vingt-dix-neuf\",\"99\"),])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NUPs1qOUg-hE" + }, + "source": [ + "Which is more efficient? Once again, it is dependent on your language and implementation. If we simply visualize each graph and their number of states:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sQ9GsIkNzxsU", + "outputId": "d70ca927-9c43-4f49-846c-c181e725e011" + }, + "outputs": [], + "source": [ + "constructed_version = (graph_seventies_and_nineties | graph_eighties)\n", + "constructed_version.num_states()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Xsgdu5TYx09_", + "outputId": "5812912f-883b-42e8-afbf-3ec4a0170345" + }, + "outputs": [], + "source": [ + "string_map_version = graph_tens_special\n", + "string_map_version.num_states()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9jzn_U7s0Sit" + }, + "source": [ + "We see that their number of states (graph vertexes) are almost equal. Yet, if we use `pynini.optimize` - a method that calls a suite of WFST minimization algorithms: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7YtqhOY90iF0", + "outputId": "26f0f51b-b00d-4f5a-9b2f-330c9812666a" + }, + "outputs": [], + "source": [ + "constructed_version.optimize()\n", + "constructed_version.num_states()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "y93SqnOf0qa8", + "outputId": "74efcbfa-a272-4fc6-e36e-f1e31c6df221" + }, + "outputs": [], + "source": [ + "string_map_version.optimize()\n", + "string_map_version.num_states()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2cTdQj9L0xhl" + }, + "source": [ + "We see the latter possessing a significantly larger amount of graph vertices. 
\n", + "\n", + "So the decision will be dependent on your ITN needs, language, concern with efficiency, and design philosophy. Further, even decisions of language dialect will have an influence. \n", + "(e.g. Belgian, Canadian, and Swiss dialects of French will dispense with elements of the vigecimal system for the decimal schema.)\n", + "\n", + ">**_Note:_** \n", + ">while `nemo_text_processing` grammars aim to minimize invalid productions, they assume input tokens are valid strings for a target language. (e.g. The mapping of \"quatre-vingt\" to `80` is permitted since it is not likely to occur in a valid French string.)* " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "V1djCnvY3CjW" + }, + "source": [ + "For more information on optimization algorithms for WFSTs, please see:\n", + "\n", + "- [M. Mohri,\"Generic epsilon-removal and input epsilon-normalization algorithms for weighted transducers\"](https://cs.nyu.edu/~mohri/pub/ijfcs.pdf)\n", + "- [M. Mohri, \"Weighted automata algorithms\"](https://cs.nyu.edu/~mohri/pub/hwa.pdf)\n", + "- [K. Thompson, \"Programming techniques: regular expression search algorithm\"](http://www.oilshell.org/archive/Thompson-1968.pdf)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hundreds\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dqPUdVBbi6gU" + }, + "source": [ + "\n", + "Moving on to the case of three digit cardinals (\"hundreds\"), it is likely that your chosen language becomes more regular in its schema. For instance, practically all French numbers `>100` obey the following:\n", + "\n", + "- `digit_from_1_to_9 + word_for_hundred + digit_from_1_to_99`\n", + "\n", + "For example:\n", + "- `203` - \"deux-cent-trois\"\n", + "- `530` - \"cinq-cent-trente\"\n", + "- `880` - \"huit-cent-quatre-vingt\"\n", + "\n", + "As such, we can write a simple `hundreds` WFST as:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lOt-gc-FiF-X" + }, + "outputs": [], + "source": [ + "hundreds = graph_digits + delete_hyphen + pynutil.delete(\"cent\") + delete_hyphen + graph_all" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Fyn1uL_NoEiz", + "outputId": "d491680b-1b3e-4762-8470-497833b82b0e" + }, + "outputs": [], + "source": [ + "apply_fst(\"deux-cent-trois\", hundreds)\n", + "apply_fst(\"huit-cent-quatre-vingts\", hundreds)\n", + "apply_fst(\"cinq-cent-trente\" , hundreds) " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qDjq_KfnoD5C" + }, + "source": [ + "Indeed, the use of French only presents two complications:\n", + "- French uses *only* the word \"cent\" for `100`. (Instead of \"un cent\".)\n", + "- 'Pure' multiples of a hundred (`200,300,400....`) use the plural \"cents\".\n", + "\n", + "The second one is the easier of the two so let's start there. There are actually two options open to us. First, we could treat \"cents\" the same way as we did \"cent\" in the base case and simply delete it. From there, the lack of any following inputs will allow the WFST to insert the trailing zeroes as appropriate." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "m2F-sumbxqLE" + }, + "outputs": [], + "source": [ + "cents = pynini.accep(\"cent\") | pynini.accep(\"cents\") # Creates a Finite State (Accep)tor, mapping inputs back to themselves\n", + "hundreds = graph_digits + delete_hyphen + pynutil.delete(cents) + delete_hyphen + graph_all" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VisQu_Etx-QB" + }, + "source": [ + "Or we can use it as a cue to 'shortcut' the WFST to immediately insert zeroes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VspiTN5Vxxjl" + }, + "outputs": [], + "source": [ + "graph_cents = pynini.cross(\"cents\", \"00\") # Creates a single input-output mapping\n", + "hundreds = graph_digits + delete_hyphen + ((pynutil.delete(\"cent\") + delete_hyphen + graph_all) | graph_cents)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "meVn5BiyyX5v" + }, + "source": [ + "For the case of solitary \"cent\", we need to make sure our output is `1` only in the case that no digit precedes the occurrence. Here we need to be confident in the structure of our WFST and that any possible ambiguity has been dealt with by this point. (Something to keep in mind as we move to the thousands.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "277Z-zLWyWAf" + }, + "outputs": [], + "source": [ + "graph_cent = pynini.cross(\"cent\", \"1\")\n", + "graph_hundreds_first_digit = (graph_digits + delete_hyphen + pynutil.delete(cents)) | graph_cent\n", + "graph_hundreds = graph_hundreds_first_digit + delete_hyphen + graph_all" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FNZlJsvS_Yvt", + "outputId": "e85ae561-e7a1-4b6a-e394-f0194fdb89e7" + }, + "outputs": [], + "source": [ + "apply_fst(\"trois-cents\", graph_hundreds) \n", + "apply_fst(\"cent\", graph_hundreds)\n", + "apply_fst(\"cent-trois\", graph_hundreds) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Thousands" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e7Dy5slLzp-K" + }, + "source": [ + "For quite a few languages, managing the WFST for the thousands place is the last aspect to figure out, as the higher powers of ten reuse the same schema. (For those working with counting systems that reserve special terms for \"ten-thousand\" (e.g. Chinese derived counting systems), you may need to extend unique coverage to the next power of ten.)\n", + "\n", + "For French, the question of thousands is rather simple: `digits_from_1_to_999 + mille + digits_from_1_to_999`\n", + "\n", + "With only the exception that any expression of one thousand drops a leading digit. 
\n", + "- `1,000` -> \"mille\"\n", + "- `1,001` -> \"mille-un\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AvsnAAiPzlu_" + }, + "outputs": [], + "source": [ + "graph_one_thousand = pynini.cross(\"mille\", \"1\")\n", + "graph_many_thousand = graph_hundreds + delete_hyphen + pynutil.delete(\"mille\")\n", + "\n", + "graph_thousands = (graph_one_thousand | graph_many_thousand) + delete_hyphen + graph_hundreds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "i3m9TG7Y4tkl", + "outputId": "d3f1f81d-c463-4934-9df7-3b8f2b67798f" + }, + "outputs": [], + "source": [ + "apply_fst(\"cent-mille-deux-cents\", graph_thousands)\n", + "apply_fst(\"deux-cent-mille-deux-cents\", graph_thousands)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NoevSTZGGT17" + }, + "source": [ + "### Weighting" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A2gcVIZM0-iv" + }, + "source": [ + "Question: will this cover all our grammar so far? (Hint: what assumptions were made about \"cent\"/\"cents\"?)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cCFtPhr1BjAc", + "outputId": "048e0d93-a4a8-4f4e-d461-bfd70e911aff" + }, + "outputs": [], + "source": [ + "apply_fst(\"deux-mille-un\", graph_thousands)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ne-7L9Cd4t-8" + }, + "source": [ + "Once again, we need to introduce the possibility of the prior power of ten not occurring in the string. There must be an option for simply inserting a string of `0` in place of the omitted \"cent\"." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iockqXdn-aG4" + }, + "source": [ + "Further, we want to be careful with how cavalier we have been with insertions. Consider the following:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bxJlSnj2-Xw3", + "outputId": "6722e5ef-8a7f-43e1-84fe-b3f5f18307e1" + }, + "outputs": [], + "source": [ + "apply_fst(\"mille-cent-un\", graph_thousands) # Should be 1101\n", + "apply_fst(\"mille-cent\", graph_thousands) # 1100" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fq5zEayA-kOx" + }, + "source": [ + "It appears that our WFST has developed a tendency to simply 'ignore' some of these higher powers. Let us return to our code for `graph_hundreds` and `graph_thousands`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "S2aV1KQ4-1iP" + }, + "outputs": [], + "source": [ + "graph_cents = pynini.cross(\"cents\", \"00\")\n", + "graph_cent = pynini.cross(\"cent\", \"1\")\n", + "graph_hundreds_first_digit = (graph_digits + delete_hyphen + pynutil.delete(cents)) | graph_cent\n", + "graph_hundreds = (graph_hundreds_first_digit + delete_hyphen | pynutil.insert(\"0\")) + graph_all \n", + "\n", + "graph_one_thousand = pynini.cross(\"mille\", \"1\")\n", + "graph_many_thousand = graph_hundreds + delete_hyphen + pynutil.delete(\"mille\")\n", + "graph_thousands = (graph_one_thousand | graph_many_thousand) + delete_hyphen + graph_hundreds" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9avwOIkk-9qt" + }, + "source": [ + "Recall that throughout we have provided options for simply inserting zeroes in the case of omitted numbers? 
That tendency has finally caught up with us. The use of our previous `graph_hundreds` in `graph_many_thousands` now allows our graph to insert a string of `0`'s without penalty. \n", + "\n", + "You may note that this is very similar to the \"cents\" example brought up at the beginning, presenting a similar solution. We can control this output by making it too costly to traverse unless absolutely necessary for the graph. This can be accomplished simply by appending a weight to the insertion for hundreds:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MQG3j0U8CUAQ" + }, + "outputs": [], + "source": [ + "graph_hundreds = (graph_hundreds_first_digit + delete_hyphen | pynutil.insert(\"0\", weight=.1)) + graph_all \n", + "\n", + "graph_one_thousand = pynini.cross(\"mille\", \"1\")\n", + "graph_many_thousand = graph_hundreds + delete_hyphen + pynutil.delete(\"mille\")\n", + "graph_thousands = (graph_one_thousand | graph_many_thousand) + delete_hyphen + graph_hundreds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KNHhrYZ7Ca58", + "outputId": "a7d07372-733d-4837-c1e9-1dc58ba2b87c" + }, + "outputs": [], + "source": [ + "apply_fst(\"mille-cent-un\", graph_thousands)\n", + "apply_fst(\"mille-cent\", graph_thousands)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "51yPEaf2EkbD" + }, + "source": [ + "Why choose a weight of `.1`? Quite simply: it's arbitrary. As mentioned earlier, the default graph in `pynini` is a tropical semiring, which uses the `min` function to select among two arcs for path traversal. Since all our paths so far are weight `0`, any positive value will ensure that it is a last option among path traversal. (Note, this conversely entails any negative weight path will be prioritized.)\n", + "\n", + "That we chose this number as a small value comes from a place of caution: the tropical semiring uses an additive function to calculate the total weight of an entire path to traverse a WFST. As our grammars can easily become massive, this means that small weights can have major impact down the line. Further, by constraining path weights to small values, we can have general certainty towards the maximum weight of any individual graph, allowing us to add constraints regarding maximum token length and token hierarchy. (As explained in [later sections](#classifyweights).) As such, when using weights in a localized setting, it is best to use small values to avoid unforeseen escalation. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iScKgvRxGt-B" + }, + "source": [ + "### Higher Powers\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rtHEd6OE2WSg" + }, + "source": [ + "At this point, we can propose a general heuristic with escalating to higher powers of ten: they always need a way for their absence to be accommodated in the WFST. Further, they require some weighting to prevent this absence from developing into a string of omitted values. To avoid further bumps, we'll take care of this now with `graph_thousands`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iZMN7wcE2lH5" + }, + "outputs": [], + "source": [ + "graph_one_thousand = pynini.cross(\"mille\", \"1\")\n", + "graph_many_thousand = graph_hundreds + delete_hyphen + pynutil.delete(\"mille\")\n", + "graph_thousands = (graph_one_thousand | graph_many_thousand | pynutil.insert(\"000\", weight=.001)) + delete_hyphen + graph_hundreds" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fkc3LIH824P7" + }, + "source": [ + "\n", + "For the rest of French (and many other languages), the rest of the work is simply repeating the prior pattern for the thousands element: \n", + "`hundreds + word_for_higher_power + hundreds.....` Of course there will be some variation in this schema, but the recursion should be regular. (It is rather rare that languages appropriate unique forms for these higher counts.) " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qGnK4ARX4Nay" + }, + "source": [ + "To finish French, we can list off the following equivalent for higher powers of ten:\n", + "- `million` - \"million/millions\" \n", + "- `billion` - \"milliard/milliards\"\n", + "- `trillion` - \"billion/billions\"\n", + "\n", + "Like the \"cent/cents\" rule, these values alternate with a plural form in the case of multiples of the value. Writing them out:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sBu7-dub4vxz" + }, + "outputs": [], + "source": [ + "millions = pynini.accep(\"million\") | pynini.accep(\"millions\")\n", + "graph_millions = ((graph_hundreds + delete_hyphen + pynutil.delete(millions) + delete_hyphen) | pynutil.insert(\"000\", weight=.1) # We need three zeroes now\n", + " ) + graph_thousands" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LmMeCHXr5Bb5" + }, + "outputs": [], + "source": [ + "billions = pynini.accep(\"milliards\") | pynini.accep(\"milliard\")\n", + "graph_billions = ((graph_hundreds + delete_hyphen + pynutil.delete(billions) + delete_hyphen)| pynutil.insert(\"000\",weight=.1) # We need three zeroes now\n", + " ) + graph_millions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CIRIeQEg5B0J" + }, + "outputs": [], + "source": [ + "trillions = pynini.accep(\"billion\") | pynini.accep(\"billions\")\n", + "graph_trillions = ((graph_hundreds + delete_hyphen + pynutil.delete(trillions) + delete_hyphen) | pynutil.insert(\"000\",weight=.1) # We need three zeroes now\n", + " ) + graph_billions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sRNUPx-15J1v" + }, + "source": [ + "Bringing all together:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0dLOWm_B5SwQ" + }, + "outputs": [], + "source": [ + "graph = graph_trillions | zero" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nBFE3BrN6IPR" + }, + "source": [ + "Let's try it out:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6lWwtR1S6LI4", + "outputId": "3a6740ee-9e92-4500-c2c8-965131167e58" + }, + "outputs": [], + "source": [ + "example = \"deux-cent-milliard-quatre-million-deux-cent-quatre-vingt-onze\"\n", + "apply_fst(example, graph) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Finishing Touches" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-w3KgX6C6mff" + }, + "source": [ + "Now 
that we have our cardinal in place, we can take care of that stylistic issue of the leading zeroes. For this, we want to develop a 'filter' that deletes all zeroes preceding the first non-zero in the string, and leave the rest 'as is.'\n", + "\n", + "First let us create the filter by calling on `NEMO_DIGIT`- a `graph_util` WFST that only permits digits as input. With it, we'll create a WFST that will delete all leading zeroes in a sting. We then compose this (using `@`) onto our original graph, creating a new graph that accepts inputs from our original but produces only the outputs of `clean_cardinal`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 290 + }, + "id": "EA4VnRe6FO-2", + "outputId": "59e412b3-a445-4172-ee64-b0f80281a167" + }, + "outputs": [], + "source": [ + "delete_leading_zeroes = pynutil.delete(pynini.closure(\"0\")) # will delete all zeroes under closure. Equivalent to regex * operator\n", + "stop_at_non_zero = pynini.difference(NEMO_DIGIT, \"0\") # creates a graph that accepts all input-outputs from NEMO_DIGIT except 0\n", + "rest_of_cardinal = pynini.closure(NEMO_DIGIT) # accepts all digits that may follow\n", + "\n", + "clean_cardinal = delete_leading_zeroes + stop_at_non_zero + rest_of_cardinal\n", + "clean_cardinal = clean_cardinal | \"0\" # We don't want to ignore the occurrence of zero\n", + "\n", + "graph = graph @ clean_cardinal " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "piP9nqQkHpo3" + }, + "source": [ + "Now our WFST will output our numbers as normal:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dnQ9odSpIAB7" + }, + "outputs": [], + "source": [ + "apply_fst(example, graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Final Notes\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "p7zt8lVsK2rY" + }, + "source": [ + "We have finally formulated a grammar that will process French cardinals into numeric representation. Of course, not every grammar you write will be for French. But several of the principles we've worked through will be invaluable in your own development. Before moving on, here's a quick summary of (almost) universal points to take away for WFST construction.\n", + "- Decide at the beginning of construction the level of constraint you wish for your grammar. Is it necessary to have a specific domain or can you rely on upstream models to narrow your input possibilities for you? \n", + "- Work iteratively upwards from the smallest place value of your numeric system. This will assist you in forming building blocks for larger values. \n", + "- Always allow for the possibility of omission of previous place values. (Not every number in the thousands will contain mention of the hundreds place.)\n", + "- For each place value, consider how the sub-grammar will affect the preceding and following place values. Are there exceptions that you've built into the grammar that may become problematic later on?\n", + "- Utilize weights for default insertions to limit path traversal to only final options. When doing so, use small values to avoid escalating problems in your larger grammar." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nvyHg1bQIIHD" + }, + "source": [ + "With that handled, we can move on to converting this grammar into a Classifier." 
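Before doing so, it can help to spot-check the finished cardinal graph against a handful of inputs so that later changes don't silently break earlier coverage. The snippet below is a minimal sketch that reuses the `apply_fst` helper and the final `graph` from the cells above; the expected strings are assumptions that follow from the grammar as assembled in this section.

```python
# Optional sanity check of the finished cardinal grammar (a sketch, not part of NeMo).
# Assumes `apply_fst` and the final `graph` (graph_trillions | zero, composed with
# clean_cardinal) are already defined in the cells above.
test_cases = {
    "zéro": "0",
    "dix-huit": "18",
    "quatre-vingt-onze": "91",
    "deux-cent-trois": "203",
    "deux-mille-un": "2001",
}
for spoken, expected in test_cases.items():
    print(f"expected: {expected}")
    apply_fst(spoken, graph)  # prints the grammar's actual output for comparison
```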
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gJ1YJUvhIZwm" + }, + "source": [ + "## Classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "q2L2x0crIeXQ" + }, + "source": [ + "Now that we have a grammar that will convert individual tokens into number strings, we now want to focus on building it into a classifier to properly tag candidate tokens. This requires a couple of properties:\n", + "- It recognizes any valid token and permits traversal through the WFST graph\n", + "- Conversely, it does not allow invalid tokens to traverse the WFST graph\n", + "- It properly disambiguates overlap among ambiguous cases\n", + "- It attributes the proper attributes to a classified token\n", + "\n", + "While this seems like a lot, in practice this just means that your grammar will need a few more tweaks to improve exclusivity." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ArEYn7RWKcYI" + }, + "source": [ + "NeMo ITN performs token classification through a series of `GraphFst` classes and assumes deployment of your grammars through an object that inherits from this class. As such, you will need to instantiate your grammar as a `CardinalFST` " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 368 + }, + "id": "GWgMSybqLqiS", + "outputId": "597c00ae-0f62-417f-888c-88c81c24a3fc" + }, + "outputs": [], + "source": [ + "class CardinalFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"cardinal\", kind=\"classify\")\n", + " # Rest of the grammar here\n", + " # ....... \n", + " #........." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SIE8dNQlL52G" + }, + "source": [ + "While the naming convention may vary, the `name` and `kind` properties must be set accordingly to permit Sparrowhawk integration.\n", + "\n", + "Further, the resulting graph must produce the classified token within the following format:\n", + "`token { cardinal { integer: \"DIGIT_STRING\" } }`\n", + "\n", + "This is accomplished by a series of string insertions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aC_c64KSNTCg" + }, + "outputs": [], + "source": [ + "class CardinalFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"cardinal\", kind=\"classify\")\n", + " # Rest of the grammar here\n", + " # ....... \n", + " #.........\n", + " self.fst = pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AGLQxOSzOK1F" + }, + "source": [ + "Followed by a call of the parent `GraphFst.add_tokens()` method:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Jz-UXFipORps" + }, + "outputs": [], + "source": [ + "class CardinalFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"cardinal\", kind=\"classify\")\n", + " # Rest of the grammar here\n", + " # ....... \n", + " #.........\n", + " self.fst = pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")\n", + " final_graph = self.add_tokens(graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gh23S7BHOY0r" + }, + "source": [ + "Which will insert the appropriate formatting. 
Note that this formatting must be exact: a single space must follow each field name and each value must be within escaped double quotes.\n", + "\n", + "In the event that you also wish for `CardinalFst` to indicate negative values, the optional `negative: ` property may be used.\n", + "\n", + "For instance, French indicates negative values by prefacing the quantity with \"moins.\" As such:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3JbTn35cOx0k" + }, + "outputs": [], + "source": [ + "optional_minus_graph = pynini.closure(\n", + " pynutil.insert(\"negative: \") + pynini.cross(\"moins\", \"\\\"-\\\"\") + \" \", 0, 1 # Note the extra space to separate the value from the integer field\n", + ")\n", + "\n", + "final_graph = optional_minus_graph + pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DCs1048v6N0K" + }, + "source": [ + "All together, your `CardinalFst` ultimately serves as a wrapper for your grammar, save with the addition of a few insertions to assist processing:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eo6uEz1s5TJY" + }, + "outputs": [], + "source": [ + "class CardinalFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"cardinal\", kind=\"classify\")\n", + " \n", + " ### Cardinal Grammar....\n", + " ### .....\n", + " graph = graph_trillions | zero \n", + "\n", + " ### Formatting grammar....\n", + " ### .....\n", + " graph = graph @ clean_cardinal\n", + "\n", + " ### Token insertion\n", + " optional_minus_graph = pynini.closure(\n", + " pynutil.insert(\"negative: \") + pynini.cross(\"moins\", \"\\\"-\\\"\") + \" \", 0, 1\n", + " )\n", + "\n", + " final_graph = optional_minus_graph + pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")\n", + "\n", + " final_graph = self.add_tokens(final_graph) # inserts the cardinal tag\n", + "\n", + " self.fst = final_graph" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MFIMdLCoZzLK" + }, + "source": [ + "Let's see a demonstration. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4CF6Iz9NZ7R_" + }, + "outputs": [], + "source": [ + "cardinal = CardinalFst().fst\n", + "\n", + "example = \"moins deux-cent-quatre\"\n", + "\n", + "apply_fst(example, cardinal)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Verbalizer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uvUqpC_Q8FSt" + }, + "source": [ + "The verbalizer can be both the most crucial and simplest part of building each grammar. On one hand, it is the component that finalizes all of your previous work. If it is unable to properly normalize your text, everything has been for naught.\n", + "\n", + "On the other hand, your previous work has vastly limited the unpredictability of your input. Recall from our initial demonstration of the classifier-verbalizer system that and input like <> becomes:\n", + "\n", + "- `tokens { name: \"le\" }`\n", + "- `tokens { date { day: \"1\" month: \"juillet\" }` \n", + "- `tokens { name: \"il\" }` \n", + "- `tokens { name: \"a\" }` \n", + "- `tokens { name: \"mangé\" }`\n", + "- `tokens { cardinal { integer: \"35\" } }` \n", + "- `tokens { name: \"pommes\" }`\n", + "\n", + "Part of the purpose of the two stage set-up is that the input space for each verbalizer is obvious: it's simply the name of its semiotic class. 
As such, we only need to write our grammar to recognize its class, remove tokens accordingly, and then manage the attributes of each semiotic token." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "et1GgmBuAWzY" + }, + "source": [ + "We will begin as we did with our classifier and create a class to inherit from the `GraphFST` utility class:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NNKpgWtkAgEW" + }, + "outputs": [], + "source": [ + "class CardinalFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"cardinal\", kind=\"verbalize\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OyAV39NsAqSN" + }, + "source": [ + "One of the useful aspects of the `GraphFst` utility is that it already possesses a built in graph that will recognize and remove semiotic tokens: `delete_tokens`. As such we need only concern ourselves with managing the properties of the Cardinal class:\n", + "- `integers`\n", + "- `negative`\n", + "\n", + "Here, the desired written format of your chosen language will dictate how you proceed. For French, we have the following rules for Cardinal numbers:\n", + "- A negative sign is written before the numeral.\n", + "- Cardinal numbers representing quantities (e.g. \"mille euros\"/ \"one thousand dollars\") are written with spaces in-between every three digits. (e.g. `1 000`)\n", + "- Cardinal numbers representing place in a sequence or addresses (\"page mille\"/\"page one thousand\") are written without spacing. (`1000`)\n", + "\n", + "The first property seems easy enough to handle: write a grammar that simply removes the `negative` formatting, leaving only `-`. (Recall that our Classifier only inserted the string if it was present.) \n", + "\n", + "For the final two, we may note that our intention to develop WFSTs for the Decimal, Measure, and Money classes already will cover most desired quantities. As such, we can leave the issue of spacing to those instances and let the Cardinal WFST default to the non-spacing case. (Note that this will be helpful with Time, Date, Telephone, Electronic, and Ordinal classes as they will not use the spacing format either. It is usually better to reserve specific formatting rules to other classes and let the Cardinal serve as a default.)\n", + "\n", + "As such, we just need our WFST to remove the `integer` property and `negative` property (if it occurs). 
These can be managed through the `pynutil.delete` function, as seen in the following:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 368 + }, + "id": "6MF2I6SLU7nf", + "outputId": "0437c4af-5c96-4122-8af0-ca37723c7228" + }, + "outputs": [], + "source": [ + "class CardinalFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"cardinal\", kind=\"verbalize\")\n", + " \n", + " # Removes the negative attribute and leaves the sign if occurs\n", + " optional_sign = pynini.closure(\n", + " pynutil.delete(\"negative:\")\n", + " + delete_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.accep(\"-\")\n", + " + pynutil.delete(\"\\\"\")\n", + " + delete_space,\n", + " 0,\n", + " 1,\n", + " )\n", + " \n", + " # removes integer aspect\n", + " graph = (\n", + " pynutil.delete(\"integer:\")\n", + " + delete_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_DIGIT, 1) # Accepts at least one digit\n", + " + pynutil.delete(\"\\\"\")\n", + " )\n", + " \n", + " graph = optional_sign + graph # concatenates two properties\n", + "\n", + " delete_tokens = self.delete_tokens(graph) # removes semiotic class tag\n", + "\n", + " self.fst = delete_tokens.optimize()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QSX2KlZJbRAA" + }, + "source": [ + "Let's see if it will properly render a given token:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JxaLm2k0bYIJ" + }, + "outputs": [], + "source": [ + "cardinal = CardinalFst().fst\n", + "example = 'cardinal { negative: \"-\" integer: \"204\" }'\n", + "\n", + "apply_fst(example, cardinal)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Bc0-QCBHWg-8" + }, + "source": [ + "That's it! We've now completed all aspects of our `CardinalFst` from grammar writing to Verbalization. While we still have quite a few semiotic classes left, you will find that they build off the `CardinalFst` quite easily, making progression much simpler and straightforward.\n", + "\n", + ">**_Note:_**\n", + ">- `delete_tokens` is called on the completed graph, despite the token class occurring first in the tokenized string. This is because the function intersects with an initial WFST that deletes the tags. As such, the function must be passed a completed graph.\n", + ">- In our initial example, all tokens were enclosed within a `token` category. Insertion and deletion of this category is managed by the main [Classifier](#tokenize-and-classify) and [Verbalizer](#verbalize-and-verbalize-final) respectively and is not a concern during individual class grammar development.\n", + ">- Earlier in the tutorial we noted that NeMo ITN permutates all WFSTs unless the `preserve_order` tag is passed as part of the Classifier. This allows you to ignore possible variation in designing the verbalizer and focus on whatever form of processing is easiest for the grammar. That is, the decision to process the `negative` property before the `integer` property is not chosen because of a consequence of the French language but instead because it is easier to write out with `pynini`. \n", + ">- Conversely, if your language is completely invariant in this regard, it may be more efficient to pass `preserve_order` through the Classifier and manage the property here in the Verbalizer. This allows NeMo ITN to avoid building states and arcs for each permutation, reducing graph size and compiling time." 
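+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before moving on, it can be handy to run the classifier and verbalizer back to back on a raw string. The cell below is only a sketch: `tagger_fst` and `verbalizer_fst` are hypothetical names standing in for the `.fst` of the \"classify\" and \"verbalize\" `CardinalFst` classes built above (in the package they live in separate tagger and verbalizer modules), and `pynini.shortestpath` is used to pick the single best output at each stage. In the full pipeline this chaining is handled by the top-level graphs discussed later."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch only: `tagger_fst` and `verbalizer_fst` are hypothetical stand-ins for the\n",
+    "# .fst of the \"classify\" and \"verbalize\" CardinalFst classes defined above.\n",
+    "text = \"moins deux-cent-quatre\"\n",
+    "\n",
+    "tagged = pynini.shortestpath(pynini.compose(text, tagger_fst)).string()\n",
+    "print(tagged)  # expect something like: cardinal { negative: \"-\" integer: \"204\" }\n",
+    "\n",
+    "normalized = pynini.shortestpath(pynini.compose(tagged, verbalizer_fst)).string()\n",
+    "print(normalized)  # expect: -204"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In practice you will rarely chain the class FSTs by hand like this; the main Classifier and Verbalizer described later take care of it, including the outer `tokens { ... }` wrapper."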
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "aFUrbSdJ8Wk7"
+   },
+   "source": [
+    "# Ordinal WFST "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "w1b0Z7f5Z9Ar"
+   },
+   "source": [
+    "Ordinals are the class of numbers used for enumerating order or placement of entities in a series. In some languages, they are simply derivations of cardinal numbers. For instance, English enumerates order as `first, second, third, fourth, fifth....` After the third ordinal, they become a regular pattern of `cardinal + 'th'`.\n",
+    "\n",
+    "Meanwhile, other languages may reserve specific counting systems for ordinals. For example, while Korean uses a Chinese-derived counting system for several Cardinal-related tasks, it uses derivations from a native counting system for ordering:\n",
+    "\n",
+    "**Cardinal**/**Ordinal** = **English**\n",
+    "- il/cheot-jae = \"First\"\n",
+    "- i/dul-jae = \"Second\"\n",
+    "- sam/set-jae = \"Third\"\n",
+    "- sa/net-jae = \"Fourth\"\n",
+    "- o/daseot-jae = \"Fifth\"\n",
+    "\n",
+    "If your language is of the latter variety, you will likely need to begin development of the Ordinal WFST by repeating Cardinal WFST development before proceeding. (Or make it part of your previous Cardinal WFST and combine it with a `union` operation.) While you can extend coverage to the level of the Cardinal WFST, you will find most Ordinals to be sufficiently covered by only enumerating to a few hundreds. (e.g. Is it common in your language to speak of the \"one millionth\" in an order and/or write out `1,000,000th`?)\n",
+    "\n",
+    "For this portion of the tutorial, we will focus on the first type of ordinals - those that are primarily derived by altering Cardinals."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "oq_xA8NPiANw"
+   },
+   "source": [
+    "## Grammar"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "lhjcQS6oiD_w"
+   },
+   "source": [
+    "Continuing with our example language, we begin by laying out our expected inputs and pinpointing a regular pattern to guide our WFSTs. We note the following examples:\n",
+    "\n",
+    " **English = French**\n",
+    " - \"first\" = \"premier/première\"\n",
+    " - \"second\" = \"second/seconde/deuxième\"\n",
+    " - \"third\" = \"troisième\"\n",
+    " - \"fourth\" = \"quatrième\"\n",
+    " - \"fifth\" = \"cinquième\"\n",
+    " - \"sixth\" = \"sixième\"\n",
+    " - \"seventh\" = \"septième\"\n",
+    "\n",
+    "From our example inputs, it appears that the spelling of French Ordinals follows a general format of: `cardinal + ième`. The only exceptions appear to be the first and second Ordinals - for which completely different roots appear - and the fourth and the fifth Ordinals - where the former drops the \"e\" at the end of the root (`quatre -> quatr`) and the latter appends a \"u\" (`cinq -> cinqu`). \n",
+    "\n",
+    "For the expected outputs, we observe the following examples:\n",
+    " - \"premier/première\" -> `1ᵉʳ/1ʳᵉ`\n",
+    " - \"second/seconde\" -> `2ᵈ/2ᵈᵉ`\n",
+    " - \"deuxième\" -> `2ᵉ`\n",
+    " - \"troisième\" -> `3ᵉ`\n",
+    " - \"quatrième\" -> `4ᵉ`\n",
+    " - \"cinquième\" -> `5ᵉ`\n",
+    " - \"sixième\" -> `6ᵉ`\n",
+    " - \"septième\" -> `7ᵉ`\n",
+    "\n",
+    "It appears that the output is simply the cardinal number of the root with an associated superscript. Since we have already constructed the Cardinal WFST, this means that the job of constructing an Ordinal WFST is simply a case of recognizing the cardinal root in the input and then utilizing a preconstructed Cardinal grammar to render the proper form alongside an associated superscript. That is, our tasks are to:\n",
+    "- Identify the proper superscript for the ordinal\n",
+    "- Change the ordinal back into a cardinal\n",
+    "- Use the Cardinal WFST to transform the cardinal into normalized form\n",
+    "- Properly render the ordinal using the normalized cardinal and proper superscript\n",
+    "\n",
+    "As information regarding the superscript will need to be conveyed through development of the Classifier, we will begin by creating the grammar necessary for rendering the ordinal as its cardinal root. \n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "AOUVZhiwT7hE"
+   },
+   "source": [
+    "### Stripping Suffixes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "5nw0_lOTsEik"
+   },
+   "source": [
+    "Since French forms Ordinals by appending a suffix to Cardinals, we should start by creating a WFST to remove the suffix. Assuming that our grammar processes one token at a time, this means that we just need a WFST that will accept all tokens that end with \"ième\" and then delete the suffix from that token:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Rk89LhsxsHTO"
+   },
+   "outputs": [],
+   "source": [
+    "strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n",
+    "graph_strip_morpheme = NEMO_SIGMA + strip_morpheme # accepts all strings until it reaches the suffix, then deletes the suffix"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "pLg-PzdntV4N"
+   },
+   "source": [
+    "Now we have a graph that permits all characters in a word token and deletes the ordinal suffix. (Note that this also means that the graph won't accept tokens without the suffix, helping us avoid false inputs.) \n",
+    "\n",
+    "We can now compose this graph with our Cardinal WFST to strip the suffixes from ordinals and treat them as cardinals. However, recall that our `CardinalFst` also inserted its own class tag. Obviously, we do not want to do this here as it will disrupt the formatting of the token. Instead, we should create a new subgraph *within* the `CardinalFst` class that will only produce the cardinals without the token formatting."
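+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before we do that, a quick sanity check that the suffix-stripper behaves as intended:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "apply_fst(\"sixième\", graph_strip_morpheme)  # expect: six\n",
+    "# A token without the suffix, e.g. \"six\", has no path through this graph and is rejected."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With the stripper working, here is the modified `CardinalFst` with the new subgraph:"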
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class CardinalFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"cardinal\", kind=\"classify\")\n", + " \n", + " ### Cardinal Grammar....\n", + " ### .....\n", + " graph = graph_trillions | zero \n", + "\n", + " ### Formatting grammar....\n", + " ### .....\n", + " graph = graph @ clean_cardinal\n", + " \n", + " ### NEW GRAPH\n", + " self.just_cardinals = graph # will produce cardinals without formatting\n", + "\n", + " ### Token insertion\n", + " optional_minus_graph = pynini.closure(\n", + " pynutil.insert(\"negative: \") + pynini.cross(\"moins\", \"\\\"-\\\"\") + \" \", 0, 1\n", + " )\n", + "\n", + " final_graph = optional_minus_graph + pynutil.insert(\"integer: \\\"\") + graph + pynutil.insert(\"\\\"\")\n", + "\n", + " final_graph = self.add_tokens(final_graph)\n", + "\n", + " self.fst = final_graph" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we call it for our graph:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vxDgBa4_t1nD" + }, + "outputs": [], + "source": [ + "graph_cardinal = CardinalFst().just_cardinals \n", + "graph_ordinal_regular_suffix = graph_strip_morpheme @ graph_cardinal" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hSpk5M7BuXRz" + }, + "source": [ + "Let's see if it works and gives us the desired cardinal:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7cJ7fieouY2r" + }, + "outputs": [], + "source": [ + "example = \"sixième\" # dervied from six/6\n", + "apply_fst(example, graph_ordinal_regular_suffix)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GtEuV7sOuxek" + }, + "source": [ + "Now we can consider the edge cases. Beyond the first and second ordinals, French exhibits irregular behavior in the following cases:\n", + "- If the cardinal root ends with an \"e\", the \"e\" is dropped before adding the suffix (e.g. \"quatrième\"). \n", + "- Cardinals ending with \"cinq\", \"neuf\", and \"dix\" change their endings to \"cinqu\", \"neuv\" , and \"diz\" before appending the suffix, respectively. \n", + "\n", + "We could start by proposing a WFST that replaces the suffix \"ième\" with \"e\" and then compose this onto the Cardinal WFST. If it is a legitimate cardinal, then there will be a path through CardinalFST and the integer will be rendered as normal. \n", + "\n", + "Meanwhile, the case of \"dix\", \"cinq\", and \"neuf\" would each require a distinct WFST as they are each a consequence of different rules of orthography and phonology. Like the case with \"e\", we could change each back to its root and then see if the CardinalWFST will permit a path with the new input. 
\n", + "\n", + "It is at this point that we can do a cost-benefit analysis and realize that all these cases can be managed by an explicit `string_map/string_file`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_9KTNQeIw4sq" + }, + "outputs": [], + "source": [ + "graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", + " (\"cinquième\",\t\"cinq\"),\n", + " (\"neuvième\",\t\"neuf\"),\n", + " (\"onzième\",\t\"onze\"),\n", + " (\"douzième\",\t\"douze\"),\n", + " (\"treizième\",\t\"treize\"),\n", + " (\"quatorzième\",\t\"quatorze\"),\n", + " (\"quinzième\",\t\"quinze\"),\n", + " (\"seizième\",\t\"seize\"),\n", + " (\"trentième\",\t\"trente\"),\n", + " (\"quarantième\",\t\"quarante\"),\n", + " (\"cinquantième\",\t\"cinquante\"),\n", + " (\"soixantième\",\t\"soixante\"),\n", + " (\"millième\",\t\"mille\"),\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eo2_keFVqaY4" + }, + "source": [ + "We could then concatenate these with a WFST that accepts all tokens with these endings and then change the endings as desired. These will provide the cardinal roots just as effectively. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "O7I29ezmxylx" + }, + "source": [ + "The same can be said for \"premier/première\" and \"second/seconde\":" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3JZoz51VyGS6" + }, + "outputs": [], + "source": [ + "graph_firsts = pynini.string_map([(\"premier\", \"un\"),(\"première\", \"un\")])\n", + "graph_seconds = pynini.string_map([(\"second\", \"deux\"),(\"seconde\", \"deux\")])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NJ9BGGAwyTQ5" + }, + "source": [ + "*Note: We graph separately to manage their different superscripts later on.*\n", + "\n", + "Depending on your language of focus, the choice of implicitly reversing the root token or explicitly mapping back to root will be the most efficient, but it is worth considering both options if only to check your understanding of the language." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8PgVwDRRq9gr" + }, + "source": [ + "Putting our grammar together, we have:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ko2kAeKwrRSH" + }, + "outputs": [], + "source": [ + "strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", + "\n", + "graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", + " (\"cinquième\",\t\"cinq\"),\n", + " (\"neuvième\",\t\"neuf\"),\n", + " (\"onzième\",\t\"onze\"),\n", + " (\"douzième\",\t\"douze\"),\n", + " (\"treizième\",\t\"treize\"),\n", + " (\"quatorzième\",\t\"quatorze\"),\n", + " (\"quinzième\",\t\"quinze\"),\n", + " (\"seizième\",\t\"seize\"),\n", + " (\"trentième\",\t\"trente\"),\n", + " (\"quarantième\",\t\"quarante\"),\n", + " (\"cinquantième\",\t\"cinquante\"),\n", + " (\"soixantième\",\t\"soixante\"),\n", + " (\"millième\",\t\"mille\"),\n", + "])\n", + "\n", + "# Component will accept all tokens that end with desired strings\n", + "graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change) \n", + "\n", + "graph_firsts = pynini.string_map([(\"premier\", \"un\"),(\"première\", \"un\")])\n", + "graph_seconds = pynini.string_map([(\"second\", \"deux\"),(\"seconde\", \"deux\")])\n", + "\n", + "graph_get_cardinal = pynini.union(graph_firsts, graph_seconds, graph_get_cardinal) \n", + "\n", + "graph_cardinal = CardinalFst().just_cardinals\n", + "\n", + "graph_ordinal = graph_get_cardinal @ graph_cardinal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ESxY3LsCdE8q" + }, + "outputs": [], + "source": [ + "apply_fst(\"sixième\", graph_ordinal)\n", + "apply_fst(\"première\", graph_ordinal)\n", + "apply_fst(\"seconde\", graph_ordinal)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qo_g8UdoUFJB" + }, + "source": [ + "## Classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kemhdKAjzEIa" + }, + "source": [ + "Now that we've found a way to pass the work of the Ordinal grammar back onto the Cardinal grammar, we can move onto the Classifier. Like before, we need to inherit from `GraphFst` to properly insert token formatting and required attributes. As well, we will again use the `integer` property to tag our digit string.\n", + "\n", + "Indeed, the only major difference between the Ordinal Classifier and the Cardinal Classifier is the replacement of optional `negative` attribute with the `morphosyntactic_feature` attribute to indicate the superscript function." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EHM4Y3TW2nXT" + }, + "source": [ + "Since we are relying on the `CardinalFst` class in our grammar, we want to consider how to instantiate an instance of it. Since our ultimate goal is to build a Classifier that unites all semiotic classes, it makes sense to simply use the `CardinalFst` that we will need to call for our ITN and pass it as an argument to our new class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 273 + }, + "id": "KsmPhWSa3LF_", + "outputId": "9e881ca9-a926-4249-dda8-9c52175569b5" + }, + "outputs": [], + "source": [ + "def __init__(self, cardinal: GraphFst):\n", + " super().__init__(name=\"ordinal\", kind=\"classify\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CtBQ-udB3S5Q" + }, + "source": [ + "To clear up the namespace, we will now be importing from the NeMo implementation of `CardinalFst` for French." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "L-JAcidf4QQg" + }, + "outputs": [], + "source": [ + "from nemo_text_processing.inverse_text_normalization.fr.taggers.cardinal import CardinalFst\n", + "\n", + "class OrdinalFst(GraphFst):\n", + " def __init__(self, cardinal: GraphFst):\n", + " super().__init__(name=\"ordinal\", kind=\"classify\")\n", + " graph_cardinal = cardinal.graph_no_exception # NeMo equivalent to self.just_cardinals" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FQfkAqZavCAB" + }, + "source": [ + "We now add in our grammar:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uUQ4BLuivGut" + }, + "outputs": [], + "source": [ + "class OrdinalFst(GraphFst):\n", + " def __init__(self, cardinal: GraphFst):\n", + " super().__init__(name=\"ordinal\", kind=\"classify\")\n", + " graph_cardinal = cardinal.graph_no_exception # may replace\n", + "\n", + " strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", + "\n", + " graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", + " (\"cinquième\",\t\"cinq\"),\n", + " (\"neuvième\",\t\"neuf\"),\n", + " (\"onzième\",\t\"onze\"),\n", + " (\"douzième\",\t\"douze\"),\n", + " (\"treizième\",\t\"treize\"),\n", + " (\"quatorzième\",\t\"quatorze\"),\n", + " (\"quinzième\",\t\"quinze\"),\n", + " (\"seizième\",\t\"seize\"),\n", + " (\"trentième\",\t\"trente\"),\n", + " (\"quarantième\",\t\"quarante\"),\n", + " (\"cinquantième\",\t\"cinquante\"),\n", + " (\"soixantième\",\t\"soixante\"),\n", + " (\"millième\",\t\"mille\"),\n", + " ])\n", + " \n", + " # Component will accept all tokens that end with desired strings\n", + " graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change) \n", + "\n", + " graph_firsts = pynini.string_map([(\"premier\", \"un\"),(\"première\", \"un\")])\n", + " graph_seconds = pynini.string_map([(\"second\", \"deux\"),(\"seconde\", \"deux\")])\n", + "\n", + " graph_get_cardinal = pynini.union(graph_firsts, graph_seconds, graph_get_cardinal) \n", + "\n", + " graph_ordinal = graph_get_cardinal @ graph_cardinal\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F_6EXPRMvnp2" + }, + "source": [ + "Now we come to the `morphosyntactic_features` property - a linguistic term for aspects of a word related to grammar. If intending to deploy your WFST through Sparrowhawk, this is the only ordinal property that is permitted (outside of the universal properties like `preserve_order`) and thus must carry all information regarding how to properly normalize the ordinal. (If Sparrowhawk deployment is not necessary, you may add additional properties to the tag.)\n", + "\n", + "How should we convey this information? Since the Verbalizer will be the main interface for our tags, it really does not matter - so long as we can reliably process the features. 
For the purposes of French, we just need `morphosyntactic_features` to decide the following:\n", + "- Insert the specific superscripts for \"premier/première\" or \"second/seconde\"\n", + "- Insert \"ᵉ\" otherwise\n", + "\n", + "We will also introduce another aspect of French Ordinals: they can be either plural or singular, identified by the suffix \"s\" on input and superscript \"ˢ\" on output. As such, our `morphosyntactic_features` should also decide the additional property:\n", + "- Insert the plural superscript " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "atctz6p-2GtV" + }, + "source": [ + "Since the default superscript is near universal, we will just specify this in our WFST and focus on the second and first ordinals as specific cases. We will create a `graph_morpheme` component that inserts the default superscript - indicated with a standard \"e\" to avoid possible encoding issues. We will then append a WFST that will graph any possible plural marker - \"s\" - as part the `morphosyntactic_features`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ui99osyP2UuQ" + }, + "outputs": [], + "source": [ + "graph_morpheme = pynutil.insert(\"e\") # Insert e superscript\n", + "graph_plural = pynini.closure(pynini.accep(\"s\"), 0, 1) # We create an acceptor since we must process the possible \"s\"\n", + "\n", + "graph_morpheme_component = graph_morpheme + graph_plural\n", + "\n", + "graph_morphosyntactic_features = (pynutil.insert(\" morphosyntactic_features: \\\"\") \n", + " + graph_morpheme_component\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QAlqubA25gq0" + }, + "source": [ + "Introducing the `integer` feature:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rs2TyIBc5la6" + }, + "outputs": [], + "source": [ + "graph_reg_ordinals = graph_get_cardinal @ graph_cardinal # Rewriting ordinals to remove the first and second ordinal.\n", + "\n", + "graph_ordinal = pynutil.insert(\"integer: \\\"\") + graph_reg_ordinals + pynutil.insert(\"\\\"\")\n", + "graph_ordinal += graph_morphosyntactic_features" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xoqk20Pi2gT8" + }, + "source": [ + "For the first and second ordinals, we can explicitly state their mappings, as these occurrences are invariable. (First and second ordinals do not need to accommodate being the endings of other terms.) As such, we can just have mappings from the token to the superscripts." 
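+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before handling those, a quick intermediate check on the regular ordinals graphed so far. The closing quote of `morphosyntactic_features` is only inserted when the class is assembled, so purely for display we append one here:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Display-only check; the real closing quote is added later in the class.\n",
+    "apply_fst(\"douzièmes\", graph_ordinal + pynutil.insert(\"\\\"\"))\n",
+    "# expect roughly: integer: \"12\" morphosyntactic_features: \"es\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, the mappings for the first and second ordinals:"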
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "54aqdH_P63Ea" + }, + "outputs": [], + "source": [ + "firsts = pynini.string_map([(\"premier\", \"er\"), (\"première\",\"re\")])\n", + "firsts += graph_plural # Still accepts plural marker in superscript\n", + "seconds = pynini.string_map([(\"second\", \"d\"),(\"seconde\", \"de\")])\n", + "seconds += graph_plural \n", + "\n", + "graph_firsts = pynutil.insert(\"integer: \\\"1\\\" morphosyntactic_features: \\\"\") + firsts\n", + "graph_seconds = pynutil.insert(\"integer: \\\"2\\\" morphosyntactic_features: \\\"\") + seconds" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D2vQ4m7o7p84" + }, + "source": [ + "Placing them in our class:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "w_JKT8JMf-Mz" + }, + "outputs": [], + "source": [ + "class OrdinalFst(GraphFst):\n", + " def __init__(self, cardinal: GraphFst):\n", + " super().__init__(name=\"ordinal\", kind=\"classify\")\n", + " graph_cardinal = cardinal.graph_no_exception # may replace\n", + "\n", + " strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", + "\n", + " graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", + " (\"cinquième\",\t\"cinq\"),\n", + " (\"neuvième\",\t\"neuf\"),\n", + " (\"onzième\",\t\"onze\"),\n", + " (\"douzième\",\t\"douze\"),\n", + " (\"treizième\",\t\"treize\"),\n", + " (\"quatorzième\",\t\"quatorze\"),\n", + " (\"quinzième\",\t\"quinze\"),\n", + " (\"seizième\",\t\"seize\"),\n", + " (\"trentième\",\t\"trente\"),\n", + " (\"quarantième\",\t\"quarante\"),\n", + " (\"cinquantième\",\t\"cinquante\"),\n", + " (\"soixantième\",\t\"soixante\"),\n", + " (\"millième\",\t\"mille\"),\n", + " ])\n", + " \n", + " # Component will accept all tokens that end with desired strings\n", + " graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change) \n", + "\n", + " # Graph will map ordinals beyond second ordinal to their cardinals\n", + " graph_reg_ordinals = graph_get_cardinal @ graph_cardinal\n", + "\n", + " # Graphing morphosyntactic_features\n", + " graph_morpheme = pynutil.insert(\"e\") # Insert e superscript\n", + " graph_plural = pynini.accep(\"s\").ques # ques is equivalent to pynini.closure(, 0, 1)\n", + "\n", + " graph_morpheme_component = graph_morpheme + graph_plural\n", + "\n", + " graph_morphosyntactic_features = (pynutil.insert(\" morphosyntactic_features: \\\"\") \n", + " + graph_morpheme_component\n", + " )\n", + "\n", + " # Adding in the `integer` property:\n", + " graph_ordinal = pynutil.insert(\"integer: \\\"\") + graph_reg_ordinals + pynutil.insert(\"\\\"\")\n", + " graph_ordinal += graph_morphosyntactic_features \n", + "\n", + " # Case of first and second ordinals\n", + " firsts = pynini.string_map([(\"premier\", \"er\"), (\"première\",\"re\")])\n", + " firsts += graph_plural # Still accepts plural marker in superscript\n", + " seconds = pynini.string_map([(\"second\", \"d\"),(\"seconde\", \"de\")])\n", + " seconds += graph_plural \n", + "\n", + " graph_firsts = pynutil.insert(\"integer: \\\"1\\\" morphosyntactic_features: \\\"\") + firsts\n", + " graph_seconds = pynutil.insert(\"integer: \\\"2\\\" morphosyntactic_features: \\\"\") + seconds\n", + "\n", + " # All together\n", + " graph_ordinal = pynini.union(graph_ordinal, graph_firsts, graph_seconds)\n", + " self.fst = graph_ordinal.optimize()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CpGHVg6chmA0" + }, + "source": [ + "Trying out on some examples:" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "b5DL3PZRhpc8" + }, + "outputs": [], + "source": [ + "cardinal = CardinalFst()\n", + "ordinal = OrdinalFst(cardinal).fst\n", + "\n", + "apply_fst(\"premier\", ordinal)\n", + "apply_fst(\"premiers\", ordinal)\n", + "apply_fst(\"seconde\", ordinal)\n", + "apply_fst(\"douzièmes\", ordinal)\n", + "apply_fst(\"cent-cinquièmes\", ordinal)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MNQVgiv-UK29" + }, + "source": [ + "### Special Tokens" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UdiNAHGh71O9" + }, + "source": [ + "If you are particularly astute, you may have noticed that we have not closed the quotations around the `morphosyntactic_features` throughout, despite doing so for `integer`. This is not a typo, as there is one more aspect of the Classifier that must be addressed: special cases.\n", + "\n", + "For your language, you may notice that there are occasional exceptions to writing rules that are signaled by a specific vocabulary token in a string. As this must be communicated to our Verbalizer, it is important that we signal this vocabulary through our Classifier. \n", + "\n", + "For French, this can occur in the normalization of centuries. When using Ordinals to indicate centuries, French commonly writes with Roman numerals. For example:\n", + "- \"Fifth century\" -> \"cinquième siècle\" -> `Vᵉ siècle` \n", + "- \"Twentieth century\" -> \"vintième siècle\" -> `XXᵉ siècle` \n", + "\n", + "As such, we must allow our Classifier to pass on the information that \"siècle\" follows an ordinal to our Verbalizer, so it may normalize with Roman numerals. We accomplish this by appending a WFST that accepts special tokens that follow our Ordinals, adding them to our `morphosyntactic_features` attribute with a forward slash to delineate." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MsWnT4BfQKcC" + }, + "outputs": [], + "source": [ + "special_tokens = pynini.accep(\"siècle\")\n", + "\n", + "graph_special_tokens = delete_space + pynutil.insert(\"/\") + special_tokens # We need to delete the space in between this token and the following one.\n", + "graph_special_tokens = pynini.closure(graph_special_tokens, 0, 1)\n", + "\n", + "graph_ordinal += graph_special_tokens + pynutil.insert(\"\\\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "698_n5SFQ_jP" + }, + "source": [ + "*Once again, it is advised to retain a tsv file in `data` to quickly append these key-words.*\n", + "\n", + "Having taken care of the special case, we may now call `add_tokens` and complete the graph (fully written out below)." 
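+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Expanding on that note about the tsv file: with one key-word per line, the acceptor can be built straight from the file, so new trigger words can be added without touching the grammar code. A rough sketch - the file name below is hypothetical:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical file of key-words, one per line; string_file builds their union.\n",
+    "special_tokens = pynini.string_file(\"data/roman_numeral_key_words.tsv\")\n",
+    "\n",
+    "graph_special_tokens = delete_space + pynutil.insert(\"/\") + special_tokens\n",
+    "graph_special_tokens = pynini.closure(graph_special_tokens, 0, 1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Either way, the completed classifier is written out in full below."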
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nZ1dkft0Riou" + }, + "outputs": [], + "source": [ + "class OrdinalFst(GraphFst):\n", + " def __init__(self, cardinal: GraphFst):\n", + " super().__init__(name=\"ordinal\", kind=\"classify\")\n", + " graph_cardinal = cardinal.graph_no_exception # may replace\n", + "\n", + " strip_morpheme = pynutil.delete(\"ième\") # deletes suffix\n", + "\n", + " graph_root_change = pynini.string_map([(\"quatrième\", \"quatre\"),\n", + " (\"cinquième\",\t\"cinq\"),\n", + " (\"neuvième\",\t\"neuf\"),\n", + " (\"onzième\",\t\"onze\"),\n", + " (\"douzième\",\t\"douze\"),\n", + " (\"treizième\",\t\"treize\"),\n", + " (\"quatorzième\",\t\"quatorze\"),\n", + " (\"quinzième\",\t\"quinze\"),\n", + " (\"seizième\",\t\"seize\"),\n", + " (\"trentième\",\t\"trente\"),\n", + " (\"quarantième\",\t\"quarante\"),\n", + " (\"cinquantième\",\t\"cinquante\"),\n", + " (\"soixantième\",\t\"soixante\"),\n", + " (\"millième\",\t\"mille\"),\n", + " ])\n", + " \n", + " # Component will accept all tokens that end with desired strings\n", + " graph_get_cardinal = NEMO_SIGMA + (strip_morpheme | graph_root_change) \n", + "\n", + " # Graph will map ordinals beyond second ordinal to their cardinals\n", + " graph_reg_ordinals = graph_get_cardinal @ graph_cardinal\n", + "\n", + " # Graphing morphosyntactic_features\n", + " graph_morpheme = pynutil.insert(\"e\") # Insert e superscript\n", + " graph_plural = pynini.accep(\"s\").ques # We create an acceptor since we must process the possible \"s\"\n", + "\n", + " graph_morpheme_component = graph_morpheme + graph_plural\n", + "\n", + " graph_morphosyntactic_features = (pynutil.insert(\" morphosyntactic_features: \\\"\") \n", + " + graph_morpheme_component\n", + " )\n", + "\n", + " # Adding in the `integer` property:\n", + " graph_ordinal = pynutil.insert(\"integer: \\\"\") + graph_reg_ordinals + pynutil.insert(\"\\\"\")\n", + " graph_ordinal += graph_morphosyntactic_features \n", + "\n", + " # Case of first and second ordinals\n", + " firsts = pynini.string_map([(\"premier\", \"er\"), (\"première\",\"re\")])\n", + " firsts += graph_plural # Still accepts plural marker in superscript\n", + " seconds = pynini.string_map([(\"second\", \"d\"),(\"seconde\", \"de\")])\n", + " seconds += graph_plural \n", + "\n", + " graph_firsts = pynutil.insert(\"integer: \\\"1\\\" morphosyntactic_features: \\\"\") + firsts\n", + " graph_seconds = pynutil.insert(\"integer: \\\"2\\\" morphosyntactic_features: \\\"\") + seconds\n", + "\n", + "\n", + " # Special tokens\n", + " special_tokens = pynini.accep(\"siècle\")\n", + "\n", + " graph_special_tokens = delete_space + pynutil.insert(\"/\") + special_tokens # We need to delete the space in between this token and the following one.\n", + " graph_special_tokens = pynini.closure(graph_special_tokens, 0, 1)\n", + "\n", + " graph_ordinal += graph_special_tokens + pynutil.insert(\"\\\"\")\n", + "\n", + " # Finishing\n", + " graph_ordinal = self.add_tokens(graph_ordinal)\n", + " self.fst = graph_ordinal.optimize()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7a4zBo-YS1QD" + }, + "source": [ + "## Verbalizer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zYbrcGyGS2rW" + }, + "source": [ + "The initial part of the Ordinal Verbalizer is similar to the Cardinal WFST: we simply need to build a Verbalizer that inherits from `GraphFST` and removes the `integer` property tag. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KUv99A_rYjb9" + }, + "outputs": [], + "source": [ + "class OrdinalFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"ordinal\", kind=\"verbalize\")\n", + " graph_integer = (\n", + " pynutil.delete(\"integer:\")\n", + " + delete_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_DIGIT, 1)\n", + " + pynutil.delete(\"\\\"\")\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zKCt_EapZXGW" + }, + "source": [ + "Now we need to manage the `morphosyntactic_features` component. The first steps seem simple enough: delete the property tag and replace the superscript indicators with the actual superscripts. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yoa_mXMLabrU" + }, + "outputs": [], + "source": [ + " # Create mappings for all superscripts\n", + " superscript = pynini.union(\n", + " pynini.cross(\"e\", \"ᵉ\"), # only delete first quote since there may be more features\n", + " pynini.cross(\"d\", \"ᵈ\"),\n", + " pynini.cross(\"r\", \"ʳ\"),\n", + " pynini.cross(\"s\", \"ˢ\"),\n", + " )\n", + "\n", + " # Append to deletion of feature property. Note that we use plus closure for multiple superscripts.\n", + " graph_morphosyntactic_features = pynutil.delete(\" morphosyntactic_features: \\\"\") + superscript.plus" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xOA7_MsUrSJS" + }, + "source": [ + "### Romanization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "K_SaG0DUa2t7" + }, + "source": [ + "Now we come to the possible Romanization component. Since we need to graph the superscript components as following the number, we want to design our graph so that `morphosyntactic_features` is the last component of the graph. However, we do not know that we need Romanization until we see the `morphosyntactic_features` component. As such, we need to design our graph such that two options are available initially for an input, but only one allows full traversal." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7dalc-tablG-" + }, + "source": [ + "![romanization.png](images/romanization.PNG)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mPTNCddNcEEE" + }, + "source": [ + "In cases where your WFST decisions are dependent on latter parts of an input string, permitting the union of two separate paths when only one is valid usually assists, as a standard pathing heuristic will only choose the valid path. \n", + "\n", + "In the case of French, this would require us to separate our Verbalizer into two parts: one for Arabic numerals and one for Roman numerals. For the Arabic WFST, we simply conclude the graph. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0YSy1PYOcuyD" + }, + "outputs": [], + "source": [ + "graph_integer = (\n", + " pynutil.delete(\"integer:\")\n", + " + delete_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_DIGIT, 1)\n", + " + pynutil.delete(\"\\\"\")\n", + " )\n", + "graph_Arabic = graph_integer + graph_morphosyntactic_features + pynutil.delete(\"\\\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nnXjUU5Pf7Sh" + }, + "source": [ + "For the Roman graph, things get a bit trickier. Ideally, we would want to build a WFST that maps each digit of `graph_Arabic` to a Roman equivalent. 
However, consider the following examples:\n", + "- 1 -> I\n", + "- 10 -> X\n", + "- 11 -> XI\n", + "- 100 -> C\n", + "- 101 -> CI\n", + "- 110 -> CX\n", + "- 111 -> CXI\n", + "\n", + "Since Roman numerals do not preserve powers of ten through digit placement, we will need to design separate FSTs for each digit position and apply them accordingly. As this can quickly become intensive, we will only work to enumerate the Ordinals from 1 to 100. (Note: We are doing this to accommodate centuries; there is little likelihood that any century beyond the 99th will be used in regular strings.)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3-fQHMc2iQrz" + }, + "source": [ + "First we design our graphs for converting from Arabic to Roman numerals:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d6PDySykiXTh" + }, + "outputs": [], + "source": [ + "digits = pynini.string_map([(\"1\", \"I\"),\n", + " (\"2\",\t\"II\"),\n", + " (\"3\",\t\"III\"),\n", + " (\"4\",\t\"IV\"),\n", + " (\"5\",\t\"V\"),\n", + " (\"6\",\t\"VI\"),\n", + " (\"7\",\t\"VII\"),\n", + " (\"8\",\t\"VIII\"),\n", + " (\"9\",\t\"IX\"),\n", + " ])\n", + "tens = pynini.string_map([(\"1\", \"X\"),\n", + " (\"2\",\t\"XX\"),\n", + " (\"3\",\t\"XXX\"),\n", + " (\"4\",\t\"XL\"),\n", + " (\"5\",\t\"L\"),\n", + " (\"6\",\t\"LX\"),\n", + " (\"7\",\t\"LXX\"),\n", + " (\"8\",\t\"LXXX\"),\n", + " (\"9\",\t\"XC\"),\n", + " ])\n", + "zero = pynutil.delete(\"0\") # No Roman representation for zero." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wb-LmwJdk59m" + }, + "source": [ + "Now we build two separate filters: one will accept only single digit Arabic numerals and the other will accept two digit Arabic numerals. For this we can use `NEMO_DIGIT`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DW3oD7Hbli2X" + }, + "outputs": [], + "source": [ + "map_one_digit = NEMO_DIGIT\n", + "map_two_digits = NEMO_DIGIT ** 2 # pynini overloads the exponent function to allow self-concatenation." 
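+    "\n",
+    "# For instance, map_two_digits accepts \"12\" but rejects \"5\" and \"123\"."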
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xtYKLy9AmJZS" + }, + "source": [ + "We now build mappings between two digit Arabic numerals and Roman numerals, composing them onto the filters:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dUy7uEUXmT_g" + }, + "outputs": [], + "source": [ + "graph_one_digit_romans = NEMO_DIGIT @ digits\n", + "\n", + "graph_two_digit_romans = tens + (digits | zero)\n", + "graph_two_digit_romans = map_two_digits @ graph_two_digit_romans\n", + "\n", + "graph_romans = graph_one_digit_romans | graph_two_digit_romans" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JEinyAMdm7RJ" + }, + "source": [ + "We now take care of the occurrence of \"siècle\" before composing onto `graph_integer`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ERO19BbynPNX" + }, + "outputs": [], + "source": [ + "graph_romans = (graph_integer @ graph_romans) + graph_morphosyntactic_features\n", + "graph_romans += pynini.cross(\"/\", \" \") + \"siècle\" + pynutil.delete(\"\\\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zN-fwrCGoToQ" + }, + "source": [ + "We finalize with a union and calling `delete_tokens`, the complete Verbalizer now being::" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kr2wcToAofWB" + }, + "outputs": [], + "source": [ + "class OrdinalFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"ordinal\", kind=\"verbalize\")\n", + "\n", + " # Maps integer and removes attribute\n", + " graph_integer = (\n", + " pynutil.delete(\"integer:\")\n", + " + delete_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_DIGIT, 1)\n", + " + pynutil.delete(\"\\\"\")\n", + " )\n", + "\n", + " # Create mappings for all superscripts\n", + " superscript = pynini.union(\n", + " pynini.cross(\"e\", \"ᵉ\"), # only delete first quote since there may be more features\n", + " pynini.cross(\"d\", \"ᵈ\"),\n", + " pynini.cross(\"r\", \"ʳ\"),\n", + " pynini.cross(\"s\", \"ˢ\"),\n", + " )\n", + "\n", + " # Append to deletion of feature property. 
Note that we use plus closure for multiple superscripts.\n", + " graph_morphosyntactic_features = pynutil.delete(\" morphosyntactic_features: \\\"\") + superscript.plus\n", + "\n", + " # Writing WFST for Arabic\n", + " graph_Arabic = graph_integer + graph_morphosyntactic_features + pynutil.delete(\"\\\"\")\n", + "\n", + " # Mapping Roman numerals\n", + " digits = pynini.string_map([(\"1\", \"I\"),\n", + " (\"2\",\t\"II\"),\n", + " (\"3\",\t\"III\"),\n", + " (\"4\",\t\"IV\"),\n", + " (\"5\",\t\"V\"),\n", + " (\"6\",\t\"VI\"),\n", + " (\"7\",\t\"VII\"),\n", + " (\"8\",\t\"VIII\"),\n", + " (\"9\",\t\"IX\"),\n", + " ])\n", + " tens = pynini.string_map([(\"1\", \"X\"),\n", + " (\"2\",\t\"XX\"),\n", + " (\"3\",\t\"XXX\"),\n", + " (\"4\",\t\"XL\"),\n", + " (\"5\",\t\"L\"),\n", + " (\"6\",\t\"LX\"),\n", + " (\"7\",\t\"LXX\"),\n", + " (\"8\",\t\"LXXX\"),\n", + " (\"9\",\t\"XC\"),\n", + " ])\n", + " zero = pynutil.delete(\"0\") # No Roman representation for zero.\n", + "\n", + " # filters for Roman digits\n", + " map_one_digit = NEMO_DIGIT\n", + " map_two_digits = NEMO_DIGIT ** 2 # pynini overloads the exponent function to allow self-concatenation.\n", + "\n", + " # Composing onto roman digits\n", + " graph_one_digit_romans = NEMO_DIGIT @ digits\n", + "\n", + " graph_two_digit_romans = tens + (digits | zero)\n", + " graph_two_digit_romans = map_two_digits @ graph_two_digit_romans\n", + "\n", + " graph_romans = graph_one_digit_romans | graph_two_digit_romans\n", + "\n", + " # Writing WFST for Roman\n", + " graph_romans = (graph_integer @ graph_romans) + graph_morphosyntactic_features\n", + " graph_romans += pynini.cross(\"/\", \" \") + \"siècle\" + pynutil.delete(\"\\\"\")\n", + "\n", + " # Final composition\n", + " graph = (graph_romans | graph_Arabic)\n", + "\n", + " delete_tokens = self.delete_tokens(graph)\n", + " self.fst = delete_tokens.optimize()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Trying out our examples:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "example_regular = 'ordinal { integer: \"12\" morphosyntactic_features: \"es\" }'\n", + "example_roman = 'ordinal { integer: \"12\" morphosyntactic_features: \"es/siècle\" }'\n", + "\n", + "fst = OrdinalFst().fst\n", + "\n", + "apply_fst(example_regular, fst)\n", + "apply_fst(example_roman, fst)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yBgLhTq9pWZe" + }, + "source": [ + "We have now completed an Ordinal WFST from the ground up, allowing a separate numbering system for special cases." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-W1-BMVJUXXk" + }, + "source": [ + "## Final notes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kR7E64P4pPU_" + }, + "source": [ + "Before moving on, there are some key takeaways that you may find useful for most (if not all) languages:\n", + "- Many ordinal systems rely on alteration of Cardinals. Even in the example of Korean, it is using a pre-existing counting system and adding a suffix to indicate ordering. 
As such, your Ordinal WFST will likely follow this tutorial's structure of changing the Ordinal to its original root and then relying on your Cardinal WFST for the majority of processing.\n", + "- The `morphosyntactic_features` property will carry the vast majority of information necessary for normalization through your Verbalizer.\n", + "- While not all writing systems have the same quirk as using Roman numerals in reference to centuries, you will likely find cases in your language when a specific token indicates unique rules for a semiotic class. Carrying this information to the Verbalizer is usually the simplest means of preserving the token while also facilitating normalization. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Rx8-LuJOUaa5" + }, + "source": [ + "# Decimal WFST " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D2MRXYxz8TGA" + }, + "source": [ + "\n", + "If the Cardinal WFST is the most crucial element of a normalization grammar, the construction of the Decimal WFST is a close second. Much like in the case of constructing Ordinals from Cardinal grammars, many aspects of the Decimal WFST will be reused throughout your other semiotic classes.\n", + "\n", + "To get started, you should study the numerical conventions in your language. In particular, you should take note of the following:\n", + "- How is the decimal component of a number pronounced in your language of focus. (e.g. The English number `1.33` can be verbalized as \"one point three three\" or \"one and thirty three hundredths.\")\n", + "- What is the punctuation mark used for decimal demarcation? (In North America, several writing systems use `.` while European nations will use `,`.)\n", + "- Are there general rules regarding pronunciation/formatting of numbers past the decimal demarcation? (e.g. Does your language pronounce each digit or pronounce as a series of three digit numbers?)\n", + "\n", + "Such questions will likely require some deep familiarity with the language, and it may benefit to ask a native speaker for some input. Of course, the level of depth is dependent on your needs, but researching these questions will help your normalization system appear more organic." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UsK78ib4N-gb" + }, + "source": [ + "## Grammar" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "p4CLOOA9OAwZ" + }, + "source": [ + "In the case of French, we have the following guidelines:\n", + "- French uses the comma ( `,` ) for decimal delineation. It is articulated as \"virgule\".\n", + "- Decimals can be read as a series of digits or grouped as Cardinal numbers arbitrarily. (e.g. \"`.333` can be \"virgule trois trois trois\" or \"virgule trois-cent-trente-trois\".) \n", + "\n", + "As such, our grammar needs to accommodate the following pattern: \n", + "\n", + "`cardinal + \"virgule\" + string_of_cardinals`\n", + "\n", + "Given our experience with our previous WFSTs, this seems simple enough. 
We assume we have an instance of CardinalFST available and create a subcomponent to map the integer portion of a decimal:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XSp9FTzhf0XZ" + }, + "outputs": [], + "source": [ + "cardinal = CardinalFst().graph_no_exception # NeMo equivalent of just_cardinals\n", + "\n", + "# place cardinal under closure to permit values <=1\n", + "graph_integer = pynini.closure(cardinal, 0, 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bk3_3iawgAZE" + }, + "source": [ + "Compose it on a subcomponent that detects the delineator \"virgule\":" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UMzfAKkngH6z" + }, + "outputs": [], + "source": [ + "delete_virgule = pynutil.delete(\"virgule\")\n", + "graph_decimal = graph_integer + delete_space + delete_virgule" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GXjbtbLYgn17" + }, + "source": [ + "And permit the occurrence of several strings of cardinals to follow:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LMMNBJz8gtTA" + }, + "outputs": [], + "source": [ + "graph_string_of_cardinals = delete_space + graph_cardinal\n", + "graph_string_of_cardinals = pynini.closure(graph_string_of_cardinals, 1)\n", + "\n", + "graph_decimal += graph_string_of_cardinals" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jTgnRLddhGdE" + }, + "source": [ + "Let us try an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "D4rjDh0ShJAp" + }, + "outputs": [], + "source": [ + "example = \"trois virgule trois cinquante-cinq\" \n", + "apply_fst(example, graph_decimal) # Should output only the cardinals in the string" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RfD1d9JOioyl" + }, + "source": [ + "### Ambiguity?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3IaI1mCIe_6i" + }, + "source": [ + "Note that our decision to include multiple strings of cardinals after the decimal marker has introduced some ambiguity into our WFST. Consider if a decimal number was followed by an integer series (e.g. `2.5, 5, 6`). Now what should be an application of one DecimalFST and two applications of a CardinalFST can be interpreted as a single DecimalFST application (e.g. `2.556`). What can be done?\n", + "\n", + "While we will address this in greater depth later (see [Tokenize and Classify](#tokenize-and-classify)), the short answer is that cases such as these must be calibrated according to use and linguistic intuition. As this is an inherent ambiguity in the language and its writing system, we can never truly remove this possibility without restricting our ability to model the language. However, we can rely on a few logical assumptions to guide our decision making:\n", + "- Unless the grammar is deployed in a restrictive setting (e.g. a Financial or environment where strings of numbers are often read in series) it's not likely for a valid string to exhibit this level of ambiguity. Speakers typically try to reduce possible ambiguity in their language production and would likely rephrase to avoid issues such as these. [See Grice's maxims](https://en.wikipedia.org/wiki/Cooperative_principle).\n", + "- While a language may allow a specific string by *rule*, speakers may typically avoid them *in practice* due to conventions or difficulty. 
In our case, while it may be possible to read `2,100 05` as \"deux virgule dix-mille-cinq\" (\"two point ten-thousand and five\"), it's dubious that a speaker would find such easier to read than \"deux virgule une zéro zéro zéro cinq\". (The place value of large strings tend to take longer to recognize.)\n", + "\n", + "While hardly satisfying, these two points will allow us to dismiss *some* worry. With the former observation being outside our grammar's ability to manage, we accommodate the latter point by using an alternate WFST from our CardinalFST: `numbers_up_to_million`. (To utilize in your own language, create a WFST in the Cardinal class right before building up to `graph_millions`. Again, calling `optimize` is advised.)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "piNe1AWspa4J" + }, + "outputs": [], + "source": [ + "cardinal = CardinalFst().numbers_up_to_million\n", + "\n", + "# place cardinal under closure to permit values <=1\n", + "graph_integer = pynini.closure(cardinal, 0, 1)\n", + "\n", + "delete_virgule = pynutil.delete(\"virgule\")\n", + "graph_decimal = graph_integer + delete_space + delete_virgule\n", + "\n", + "graph_string_of_cardinals = delete_space + cardinal\n", + "graph_string_of_cardinals = pynini.closure(graph_string_of_cardinals, 1)\n", + "\n", + "graph_decimal += graph_string_of_cardinals" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B1gglt0tfM5V" + }, + "source": [ + "## Classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fVkOWkncgOZc" + }, + "source": [ + "Like with our previous WFSTs, the main duty for the classifier is inserting the necessary properties for the semiotic token. For the `decimal` tag, the following properties are used:\n", + "- `integer_part` - indicates value before decimal marker\n", + "- `fractional_part` - indicates values after the decimal marker\n", + "- `negative` - indicates if value is positive or negative (Optional)\n", + "- `quantity` - designates if decimal is in regards to a specific quantity. (See Quantities.)\n", + "\n", + "We can begin by inserting the `integer_part` around our `cardinal` subcomponent and the `fractional_part` around our `graph_string_of_cardinals`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_zw_cDszh-fB" + }, + "outputs": [], + "source": [ + "graph_integer = pynutil.insert(\"integer_part: \\\"\") + cardinal + pynutil.insert(\"\\\" \")\n", + "graph_fractional = pynutil.insert(\"fractional_part: \\\"\") + graph_string_of_cardinals + pynutil.insert(\"\\\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bxlnn_7tiQMn" + }, + "source": [ + "We then concatenate them together with a component that recognizes and removes the decimal separator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BxNS9_AwiWHf" + }, + "outputs": [], + "source": [ + "graph_integer_or_none = graph_integer | pynutil.insert(\"integer_part: \\\"0\\\" \", weight=.1) # In cases we don't always have an integer preceding\n", + "graph_decimal_no_sign = graph_integer_or_none + delete_space + pynutil.delete(\"virgule\") + graph_fractional" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b7uGfsi4i5UI" + }, + "source": [ + "*Note that we allow insertion of 0 if there is no integer to accommodate reading of only decimal values*\n", + "\n", + "Now we allow the possibility of negative values. 
(Recall French uses \"moins\" to indicate the negative.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VsP79naojQZR" + }, + "outputs": [], + "source": [ + "graph_negative = pynini.cross(\"moins\", \"negative: \\\"-\\\" \") + delete_space\n", + "graph_decimal = graph_negative + graph_decimal_no_sign" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QTcvq5HqllqW" + }, + "outputs": [], + "source": [ + "example = \"moins deux virgule cent-quatre\"\n", + "apply_fst(example, graph_decimal)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FVKuGj_9mZ75" + }, + "source": [ + "Placing within a `DecimalFst` class, we have:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tXwr32ermesp" + }, + "outputs": [], + "source": [ + "class DecimalFst(GraphFst):\n", + " def __init__(self, cardinal: GraphFst):\n", + " super().__init__(name=\"decimal\", kind=\"classify\")\n", + " cardinal = cardinal.numbers_up_to_million\n", + " delete_virgule = pynutil.delete(\"virgule\")\n", + "\n", + " graph_integer = pynutil.insert(\"integer_part: \\\"\") + cardinal + pynutil.insert(\"\\\" \") + delete_space\n", + " graph_integer_or_none = graph_integer | pynutil.insert(\"integer_part: \\\"0\\\" \", weight=.001) # In cases we don't always have an integer preceding\n", + "\n", + " graph_string_of_cardinals = delete_space + cardinal\n", + " graph_string_of_cardinals = pynini.closure(graph_string_of_cardinals, 1)\n", + " graph_fractional = pynutil.insert(\"fractional_part: \\\"\") + graph_string_of_cardinals + pynutil.insert(\"\\\"\")\n", + "\n", + " graph_decimal_no_sign = graph_integer_or_none + pynutil.delete(\"virgule\") + graph_fractional \n", + "\n", + " graph_negative = pynini.cross(\"moins\", \"negative: \\\"-\\\" \") + delete_space\n", + " graph_negative = pynini.closure(graph_negative, 0, 1)\n", + "\n", + " graph_decimal = graph_negative + graph_decimal_no_sign\n", + "\n", + " graph = self.add_tokens(graph_decimal)\n", + " self.fst = graph.optimize()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gjxI5mEKfHLo" + }, + "source": [ + "### Quantities" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3WuwWPf3py7G" + }, + "source": [ + "Recalling our earlier remarks regarding convention in language use, you may find a need to adjust the DecimalFst when processing specific values. For instance, consider the following equivalencies from English:\n", + "- `1,500,000` = \"one million five hundred thousand\" = \"one point five million\" = `1.5 million`\n", + "- `2,750,000` = \"two million seven hundred and fifty thousand\" = \"two point seven five million\" = `2.75 million`\n", + "\n", + "For large numbers, there is a tendency to use the decimal system as though one is describing a quantity. Notably, there is a minimum value for which this is comfortable. (A speaker of English may say \"three point five trillion\" but \"three point five hundred\" comes off as odd.)\n", + "\n", + "This behavior can occur in other languages. For example, the amount of `$1,500,000` may be read in French as \"une virgule cinq million de dollars\" (\"one point five million dollars\"). " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RgMBIKlYdsGz" + }, + "source": [ + "Our Classifier can be made to accommodate this behavior: we simply need to repeat what we did for `OrdinalFst` and set aside several key terms to trigger our model. 
For French, we will choose all terms added for values greater than a million. (Chosen empirically.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vEcsUXw5fUEe" + }, + "outputs": [], + "source": [ + "suffix = pynini.union(\n", + " \"million\",\n", + " \"millions\",\n", + " \"milliard\",\n", + " \"milliards\",\n", + " \"billion\",\n", + " \"billions\",\n", + " \"billiard\",\n", + " \"billiards\",\n", + " \"trillion\",\n", + " \"trillions\",\n", + " \"trilliard\",\n", + " \"trilliards\",\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wIIUAsR-fgQA" + }, + "source": [ + "We will then need to use a WFST to graph any numbers the precede these amounts. Note, unlike for our `DecimalFst`, we need to permit cardinals as well as decimals. This is because we want to be able to normalize a phrase like \"three million\" to `3 million` as this will be less obtrusive than `3,000,000`.\n", + "\n", + "As such, we will call a `CardinalFst` and a `DecimalFst` in for `graph_quantities`. Since these are both utilized for our `DecimalFst`, it would be more efficient to just pass them along as function/class variables." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yern-idtycWg" + }, + "outputs": [], + "source": [ + "def get_quantity(decimal, cardinal_up_to_thousand):\n", + " key_values = pynini.union(\n", + " \"million\",\n", + " \"millions\",\n", + " \"milliard\",\n", + " \"milliards\",\n", + " \"billion\",\n", + " \"billions\",\n", + " \"billiard\",\n", + " \"billiards\",\n", + " \"trillion\",\n", + " \"trillions\",\n", + " \"trilliard\",\n", + " \"trilliards\",\n", + " )\n", + " # The French WFST that this borrows from has not removed leading zeroes yet.\n", + " numbers = cardinal_up_to_thousand @ (\n", + " pynutil.delete(pynini.closure(\"0\")) + pynini.difference(NEMO_DIGIT, \"0\") + pynini.closure(NEMO_DIGIT)\n", + " )\n", + " res = (\n", + " pynutil.insert(\"integer_part: \\\"\")\n", + " + numbers\n", + " + pynutil.insert(\"\\\"\")\n", + " + (\n", + " pynini.union(delete_hyphen, delete_extra_space)\n", + " ) # Can be written either as 'deux-millions' or 'deux millions' depending on whether it registers as a noun or part of cardinal.\n", + " + pynutil.insert(\" quantity: \\\"\")\n", + " + suffix\n", + " + pynutil.insert(\"\\\"\")\n", + " )\n", + " # Union with decimal to permit either a cardinal or decimal representation.\n", + " res |= decimal + delete_extra_space + pynutil.insert(\" quantity: \\\"\") + suffix + pynutil.insert(\"\\\"\")\n", + " return res" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uT4LMo8ADBAq" + }, + "source": [ + "We can now insert this into our Classifier, producing the following:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d2KrCuyGDLwh" + }, + "outputs": [], + "source": [ + "class DecimalFst(GraphFst):\n", + " def __init__(self, cardinal: GraphFst):\n", + " super().__init__(name=\"decimal\", kind=\"classify\")\n", + " quantities_cardinal = cardinal.graph_hundreds_component_at_least_one_none_zero_digit\n", + " cardinal = cardinal.graph_no_exception\n", + " delete_virgule = pynutil.delete(\"virgule\")\n", + "\n", + " graph_integer = pynutil.insert(\"integer_part: \\\"\") + cardinal + pynutil.insert(\"\\\" \") + delete_space\n", + " graph_integer_or_none = graph_integer | pynutil.insert(\"integer_part: \\\"0\\\" \", weight=.001) # In cases we don't always have an integer preceding\n", + "\n", + " 
graph_string_of_cardinals = delete_space + cardinal\n", + " graph_string_of_cardinals = pynini.closure(graph_string_of_cardinals, 1)\n", + " graph_fractional = pynutil.insert(\"fractional_part: \\\"\") + graph_string_of_cardinals + pynutil.insert(\"\\\"\")\n", + "\n", + " graph_decimal_no_sign = graph_integer_or_none + delete_virgule + graph_fractional \n", + "\n", + " graph_negative = pynini.cross(\"moins\", \"negative: \\\"-\\\" \") + delete_space\n", + " graph_negative = pynini.closure(graph_negative, 0, 1)\n", + " graph_decimal = graph_negative + graph_decimal_no_sign\n", + "\n", + " # Union default decimal with version that accepts quantities\n", + " graph_decimal |= graph_negative + get_quantity(\n", + " graph_decimal_no_sign, quantities_cardinal\n", + " )\n", + " final_graph = self.add_tokens(graph_decimal)\n", + " self.fst = final_graph.optimize()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cD-eKqO6qTyh" + }, + "outputs": [], + "source": [ + "cardinal = CardinalFst()\n", + "decimal = DecimalFst(cardinal).fst\n", + "example = \"trois virgule cent-quatre billion\"\n", + "apply_fst(example, decimal)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HiSLKF3RfRZA" + }, + "source": [ + "## Verbalizer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QnkOV5FlteQA" + }, + "source": [ + "As before, the Verbalizer is responsible for removing the formatting and rendering a given token in conventional form. As the process remains similar to Ordinals and Cardinals (deleting strings in a regular matter) we will instead focus on a unique concern for `DecimalFst`: numeral spacing.\n", + "\n", + "For some writing systems, decimal numbers and other strings are typically not written as a single string, instead using punctuation to group numbers for clarity. For example, in the United States, integer digits greater than a thousand are separated by commas for every three digits:\n", + "- `12345.678` -> `12,345.678`\n", + "\n", + "A similar rule occurs in French, save it employs spaces on each side of the decimal marker:\n", + "- `12345,6789` -> `12 345,678 9`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2h4WQZ1a4Cpc" + }, + "source": [ + "While simple enough, this rule poses a slight complication: it works from the left and right of the decimal separator, whereas WFSTs process linearly from the beginning (or end) of strings. As such we will need to break the formatting rule into two components: one for the integer component and one for the decimal component." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ViOFNdZw4-qu" + }, + "source": [ + "Starting with the integer component, we need our subcomponent to recognize every three digits and insert a space before. We can achieve this with some `graph_utils` helper objects - `NEMO_DIGIT` and `NEMO_NON_BREAKING_SPACE`, which accept all digits and non-breaking spaces, respectively. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z36be2Vo5VbR" + }, + "outputs": [], + "source": [ + "every_three_digits = NEMO_DIGIT ** 3 # accepts a string of three digits\n", + "space_every_three_integer = pynini.closure(NEMO_NON_BREAKING_SPACE + every_three_digits) # inserts space before every three digits." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RSB2gGH-5vwi" + }, + "source": [ + "However, we cannot let the component insert spaces when there are *only* three digits (e.g. `100`.) 
As such, we need to make sure the insertion only begins starting from the beginning of a string (e.g. when there is a string between one and three digits.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wfWp3ghH6mDQ" + }, + "outputs": [], + "source": [ + "space_every_three_integer = pynini.closure(NEMO_DIGIT, 1, 3) + space_every_three_integer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NJrQYSfA6vyu" + }, + "source": [ + "For the case of the decimal spacing, we simply reverse the logic:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vBP6ncTp6yXX" + }, + "outputs": [], + "source": [ + "space_every_three_decimal = pynini.closure(NEMO_NON_BREAKING_SPACE + every_three_digits)\n", + "space_every_three_decimal = space_every_three_decimal + pynini.closure(NEMO_DIGIT, 1, 3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WRXPN_gk69VV" + }, + "source": [ + "Placed into our Verbalizer, we would see the following:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "h49eztvs7BXH" + }, + "outputs": [], + "source": [ + "class DecimalFst(GraphFst):\n", + " \"\"\"\n", + " Finite state transducer for verbalizing decimal, e.g.\n", + " decimal { negative: \"true\" integer_part: \"12\" fractional_part: \"5006\" quantity: \"billion\" } -> -12.5006 billion\n", + " \"\"\"\n", + "\n", + " def __init__(self):\n", + " super().__init__(name=\"decimal\", kind=\"verbalize\")\n", + "\n", + " # Need parser to group digits by threes\n", + " exactly_three_digits = NEMO_DIGIT ** 3\n", + " at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3)\n", + "\n", + " space_every_three_integer = (\n", + " at_most_three_digits + (pynutil.insert(NEMO_NON_BREAKING_SPACE) + exactly_three_digits).closure()\n", + " )\n", + " space_every_three_decimal = (\n", + " pynini.accep(\",\")\n", + " + (exactly_three_digits + pynutil.insert(NEMO_NON_BREAKING_SPACE)).closure()\n", + " + at_most_three_digits\n", + " )\n", + " group_by_threes = space_every_three_integer | space_every_three_decimal\n", + " self.group_by_threes = group_by_threes\n", + "\n", + " optional_sign = pynini.closure(pynini.cross(\"negative: \\\"true\\\"\", \"-\") + delete_space, 0, 1)\n", + " integer = (\n", + " pynutil.delete(\"integer_part:\")\n", + " + delete_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_NOT_QUOTE, 1)\n", + " + pynutil.delete(\"\\\"\")\n", + " )\n", + " integer = integer @ group_by_threes\n", + " optional_integer = pynini.closure(integer + delete_space, 0, 1)\n", + " fractional = (\n", + " pynutil.insert(\",\")\n", + " + pynutil.delete(\"fractional_part:\")\n", + " + delete_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_NOT_QUOTE, 1)\n", + " + pynutil.delete(\"\\\"\")\n", + " )\n", + " fractional = fractional @ group_by_threes\n", + " optional_fractional = pynini.closure(fractional + delete_space, 0, 1)\n", + " quantity = (\n", + " pynutil.delete(\"quantity:\")\n", + " + delete_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_NOT_QUOTE, 1)\n", + " + pynutil.delete(\"\\\"\")\n", + " )\n", + " optional_quantity = pynini.closure(pynutil.insert(\" \") + quantity + delete_space, 0, 1)\n", + " graph = (optional_integer + optional_fractional + optional_quantity).optimize()\n", + " self.numbers = graph # Saving just the part of the graph used for numbers\n", + " graph = optional_sign + graph\n", + " delete_tokens = 
self.delete_tokens(graph)\n", + " self.fst = delete_tokens.optimize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Trying out some examples:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fst = DecimalFst().fst\n", + "\n", + "example1 = 'decimal { integer_part: \"3\" fractional_part: \"10453\" quantity: \"billion\" }'\n", + "example2 = 'decimal { integer_part: \"22323\" fractional_part: \"104553\" }'\n", + "\n", + "apply_fst(example1, fst)\n", + "apply_fst(example2, fst)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CZbshZCW8clI" + }, + "source": [ + "# Money WFST " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xuiv8HMz7yjm" + }, + "source": [ + "Now that we've handled some of the foundational classes, it's time to see how they build up to permit more concrete ones. Let's see how the previous WFSTs assist in building a WFST for normalizing currency: the `MoneyFst`. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wTU2c7MtUpqF" + }, + "source": [ + "## Grammar" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qqyRm8Ru8TDf" + }, + "source": [ + "While the exact phrasing will vary, a valid string for currency will possess the following qualities:\n", + "- A major and/or minor denomination of currency\n", + "- A numeric quantity of the denomination \n", + "\n", + "As our `CardinalFst` and `OrdinalFst` already allow us to normalize the quantity, the only issue for `MoneyFst` is to graph the amounts and build a vocabulary to recognize the denominations.\n", + "\n", + "For French, we will use the following examples to build upon:\n", + "- \"une euros\" -> `1 €`\n", + "- \"deux euros\" -> `2 €` \n", + "- \"deux euros cinq\" -> `2,5 €` \n", + "- \"cinq centimes\" -> `0,5 €`\n", + "- \"deux billions de euros\" -> `2 billions de euros`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FMqUir9n9_cA" + }, + "source": [ + "These suggest the following requirements of our grammar:\n", + "- There must be a mapping between \"euro\" and \"centime\" and `€` in our vocabulary\n", + "- This mapping must allow both singular and plural forms\n", + "- The currency denomination is phrased between major and minor denominations (\"une euro cinq\" and not \"une cinq euro\")\n", + "- Large quantities of currency are left 'as is' instead of normalized\n", + "\n", + "We may deal with the vocabulary in the typical fashion:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XN9nbNhB-vEV" + }, + "outputs": [], + "source": [ + "major_currency = pynini.string_map([(\"euro\", \"€\")])\n", + "minor_currency = pynini.string_map([(\"centime\", \"€\")])\n", + "\n", + "graph_plural = pynutil.delete(\"s\").ques\n", + "\n", + "major_currency += graph_plural\n", + "minor_currency += graph_plural" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3aHrm1qPAc-f" + }, + "source": [ + "Moving to the numbers, note that we need to append a leading zero to the value of fractional currency amounts (\"five cents\" -> `$0.05`). 
We bring back the subgraph from `CardinalFst` that maps tokens to numbers without tokenization to assist with this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jwi-yQW1AjvG" + }, + "outputs": [], + "source": [ + "from nemo_text_processing.inverse_text_normalization.fr.taggers import cardinal\n", + "\n", + "cardinal_graph = cardinal.CardinalFst()\n", + "graph_cardinal = cardinal_graph.graph_no_exception # graphs cardinals w/o tokenization\n", + "\n", + "add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert(\"0\") + NEMO_DIGIT)\n", + "graph_fractional_values = graph_cardinal @ add_leading_zero_to_double_digit" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let us consider how to manage large quantities of currency. In our example (\"deux billions de euros\" -> `2 billions de euros`) we see that its behavior mirrors that of our `get_quantity` portion of `DecimalFst`. As such, it would be useful if there were a subcomponent of that graph that we could reuse here. Like in the case of `CardinalFst`, let us go back and create a subgraph for later use. Since all our quantities are positive, this would be best accomplished right before incorporating the `negative` property, creating a `self.final_graph_wo_negative`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class DecimalFst(GraphFst):\n", + " def __init__(self, cardinal: GraphFst):\n", + " super().__init__(name=\"decimal\", kind=\"classify\")\n", + " quantities_cardinal = cardinal.graph_hundreds_component_at_least_one_none_zero_digit\n", + " cardinal = cardinal.graph_no_exception\n", + " delete_virgule = pynutil.delete(\"virgule\")\n", + "\n", + " graph_integer = pynutil.insert(\"integer_part: \\\"\") + cardinal + pynutil.insert(\"\\\" \") + delete_space\n", + " graph_integer_or_none = graph_integer | pynutil.insert(\"integer_part: \\\"0\\\" \", weight=.001) # In cases we don't always have an integer preceding\n", + "\n", + " graph_string_of_cardinals = delete_space + cardinal\n", + " graph_string_of_cardinals = pynini.closure(graph_string_of_cardinals, 1)\n", + " graph_fractional = pynutil.insert(\"fractional_part: \\\"\") + graph_string_of_cardinals + pynutil.insert(\"\\\"\")\n", + "\n", + " graph_decimal_no_sign = graph_integer_or_none + delete_virgule + graph_fractional \n", + "\n", + " ### NEW GRAPH HERE\n", + " self.final_graph_wo_negative = graph_decimal_no_sign | get_quantity(\n", + " graph_decimal_no_sign, quantities_cardinal\n", + " )\n", + " \n", + " graph_negative = pynini.cross(\"moins\", \"negative: \\\"-\\\" \") + delete_space\n", + " graph_negative = pynini.closure(graph_negative, 0, 1)\n", + " graph_decimal = graph_negative + graph_decimal_no_sign\n", + "\n", + " # Union default decimal with version that accepts quantities\n", + " graph_decimal |= graph_negative + get_quantity(\n", + " graph_decimal_no_sign, quantities_cardinal\n", + " )\n", + " final_graph = self.add_tokens(graph_decimal)\n", + " self.fst = final_graph.optimize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Allowing us to change our grammar to:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_text_processing.inverse_text_normalization.fr.taggers import cardinal, decimal\n", + "\n", + "cardinal_graph = cardinal.CardinalFst()\n", + "decimal_graph 
= decimal.DecimalFst(cardinal_graph)\n", + "\n", + "graph_cardinal = cardinal_graph.graph_no_exception # graphs cardinals w/o tokenization\n", + "graph_decimal = decimal_graph.final_graph_wo_negative # graphs positive decimals w/o tokenization\n", + "\n", + "add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert(\"0\") + NEMO_DIGIT)\n", + "graph_fractional_values = graph_cardinal @ add_leading_zero_to_double_digit" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L1RHoW-TLzIz" + }, + "source": [ + "Note that by doing this, we're also incorporating the formatting from the `decimal` class up to this point. Since these overlap with the `money` class (see next section), we have saved ourselves some work. \n", + "\n", + "Since we already made `graph_quantity` part of our `DecimalFst`, we can avoid dealing with large quantities now. However, this does mean we still need a way to leave currencies 'as is' without normalization. We can do this by using the `project` method, which will create a WFST that excepts either all valid inputs or all valid outputs of another WFST (depending on argument)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7l_TLtJkMluU" + }, + "outputs": [], + "source": [ + "major_currency_no_normalize = major_currency.project(\"input\")\n", + "apply_fst(\"euro\", major_currency_no_normalize)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "raBdHc_WXEpG" + }, + "source": [ + "We then append this WFST with a WFST that recognizes prepositions commonly used before large values of currency (\"d'\", \"des\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CEuxiVgDXRBf" + }, + "outputs": [], + "source": [ + "graph_preposition = pynini.union(\"des \", \"d'\") # Used for large amounts (billions de euros)\n", + "major_currency_no_normalize = pynini.closure(graph_preposition, 0, 1) + major_currency.project(\"input\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FlXmf8Fq_Rm1" + }, + "source": [ + "## Classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T5BBuQRzLuXS" + }, + "source": [ + "For the Money semiotic class, we have available the following properties for tokenization:\n", + "- `integer_part`\n", + "- `fractional_part` \n", + "- `currency`\n", + "\n", + "Laying the initial groundwork seems simple enough. 
We first instantiate our `MoneyFst` classifier with our initial grammars:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EZaCeHcFWVP3" + }, + "outputs": [], + "source": [ + "class MoneyFst(GraphFst):\n", + " def __init__(self, cardinal: GraphFst, decimal: GraphFst):\n", + " super().__init__(name=\"money\", kind=\"classify\")\n", + " major_currency = pynini.string_map([(\"euro\", \"€\")])\n", + " minor_currency = pynini.string_map([(\"centime\", \"€\")])\n", + "\n", + " graph_plural = pynutil.delete(\"s\").ques\n", + "\n", + " major_currency += graph_plural\n", + " minor_currency += graph_plural\n", + "\n", + " major_currency_no_normalize = major_currency.project(\"input\")\n", + " graph_preposition = pynini.union(\"des \", \"d'\") # Used for large amounts (billions de euros)\n", + " major_currency_no_normalize = graph_preposition + major_currency.project(\"input\")\n", + "\n", + " graph_cardinal = cardinal.graph_no_exception\n", + " graph_decimal = decimal.final_graph_wo_negative\n", + "\n", + " add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert(\"0\") + NEMO_DIGIT)\n", + " graph_fractional_values = graph_cardinal @ add_leading_zero_to_double_digit" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_bpkXroLWaBo" + }, + "source": [ + "Let us now manage the `currency` property. We have the following scenarios to consider:\n", + "- Major denomination only\n", + "- Minor denomination only\n", + "- Major denomination and implicit minor denomination (\"cinq euro trois\")\n", + "- Major denomination and explicit minor denomination (\"cinq euros et trois centimes\")\n", + "- Large quantities of euros (\"cinq billion des euros\")\n", + "\n", + "Note how across cases the use of `graph_cardinal` and `graph_decimal` will be applied differently. Further, we may have varying orders in which tags are assigned proper values. For instance, if we have only minor denomination we would assign `fractional_part` before `currency`. Meanwhile, major denomination and implicit minor denomination would be the order of `integer_part`, `currency`, `fractional_part`. While we could try and figure out a way to preserve order, recall that the use of permutations in NeMo ITN makes that unnecessary: we can assume the desired order of tags reach our Verbalizer without make overt efforts in our Classifier! \n", + "\n", + "For example, let's say we need to process \"five dollars\" as `$5.00`. Processed linearly, we could get a token sequence along the lines of: `{ integer_part: \"5\" currency: \"$\" }`. If we passed this token array straight to a Verbalizer, we would need to configure a graph that effectively reverses the order so we could parse the `currency` field prior to the `integer_part` field, perhaps something along the lines of: \n", + "\n", + "`pynutil.insert(\"$\") + delete_space + pynutil.delete('integer_part: \\\"') +.... + pynutil.delete('currency: \"$\"')`\n", + "\n", + "But since NeMo creates permutations of our Classifier outputs, this is unnecessary. We can simply assume whatever would be the most convenient order for us (e.g. 
`{ currency: \"$\" integer_part: \"5\" }`) and build our Verbalizer around that:\n", + "\n", + "`pynutil.delete('currency: \\\"') + NEMO_SIGMA + pynutil.delete('\\\" integer_part: \\\"') + NEMO_DIGIT +...`\n", + "\n", + "Along with helping to keep our script simpler (we can focus simply on tokenization and not worry about what input order our Verbalizers will accept), this also allows us to overcome structural constraints of WFSTs, namely that they are [limited in reordering text strings](https://en.wikipedia.org/wiki/Pushdown_automaton)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fMZ13D2Dh9ZF" + }, + "source": [ + "Keeping this in mind, let's begin mapping the proper tags. Since they're relatively simple, we can start with only major and minor denominations:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EtwWLp7VbbjM" + }, + "outputs": [], + "source": [ + "graph_integer_component = pynutil.insert(\"integer_part: \\\"\") + graph_cardinal + pynutil.insert(\"\\\"\")\n", + "graph_fractional_component = pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\")\n", + "\n", + "graph_major_currency = pynutil.insert(\" currency: \\\"\") + major_currency + pynutil.insert(\"\\\"\")\n", + "graph_minor_currency = pynutil.insert(\" currency: \\\"\") + minor_currency + pynutil.insert(\"\\\"\")\n", + "\n", + "graph_only_major_money = graph_integer_component + delete_space + graph_major_currency\n", + "graph_only_minor_money = graph_fractional_component + delete_space + graph_minor_currency " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XTmxrK4DmS39" + }, + "source": [ + "Now we may append the case of an implicit `fractional_part` to `graph_only_major_money`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Zvzn3pQinkT0" + }, + "outputs": [], + "source": [ + "implicit_fractional_part = delete_space + pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\") \n", + "implicit_fractional_part = pynini.closure(implicit_fractional_part, 0, 1) " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tKFZkCVmn1OX" + }, + "source": [ + "And the explicit fractional portion:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d_h0pTlMn3jz" + }, + "outputs": [], + "source": [ + "delete_et = pynutil.delete(\"et \") # Sometimes prefaces the minor currency\n", + "delete_et = pynini.closure(delete_et, 0 , 1)\n", + "\n", + "delete_minor = pynutil.delete(minor_currency.project(\"input\")) # to remove the minor currency\n", + "\n", + "explicit_fractional_part = pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\") \n", + "explicit_fractional_part = delete_space + delete_et + explicit_fractional_part + delete_space + delete_minor\n", + "explicit_fractional_part = pynini.closure(explicit_fractional_part, 0, 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rvnpAudgo-o3" + }, + "source": [ + "We join them together:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qYzlIRWTpD8e" + }, + "outputs": [], + "source": [ + "graph_major_money = graph_only_major_money + (implicit_fractional_part | explicit_fractional_part)\n", + "graph_standard_money = graph_major_money | graph_only_minor_money" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TzeaKXVzpYs8" + }, + "source": [ + 
"Finishing with the case the large quantities of money, we need to use `graph_decimal` so we can exploit its ability to map quantities. Note that since we are using a pre-existing WFST, we can ignore inserting the tags ourselves, since this is already done by the Decimal WFST. As long as we remember to process this aspect with our Verbalizer, we can spare ourselves the extra step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LnqX9mGFpmJm" + }, + "outputs": [], + "source": [ + "graph_large_money = pynutil.insert(\" currency: \\\"\") + major_currency_no_normalize + pynutil.insert(\"\\\"\")\n", + "graph_large_money = graph_decimal + delete_space + graph_large_money" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "24TUZnJKqgPA" + }, + "source": [ + "Alltogether, this would give the following Classifier:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "B7-muCO2qizg" + }, + "outputs": [], + "source": [ + "class MoneyFst(GraphFst):\n", + " def __init__(self, cardinal: GraphFst, decimal: GraphFst):\n", + " super().__init__(name=\"money\", kind=\"classify\")\n", + " major_currency = pynini.string_map([(\"euro\", \"€\")])\n", + " minor_currency = pynini.string_map([(\"centime\", \"€\")])\n", + "\n", + " graph_plural = pynutil.delete(\"s\").ques\n", + "\n", + " major_currency += graph_plural\n", + " minor_currency += graph_plural\n", + "\n", + " major_currency_no_normalize = major_currency.project(\"input\")\n", + " graph_preposition = pynini.union(\"des \", \"d'\") # Used for large amounts (billions de euros)\n", + " major_currency_no_normalize = graph_preposition + major_currency.project(\"input\")\n", + "\n", + " graph_cardinal = cardinal.graph_no_exception\n", + " graph_decimal = decimal.final_graph_wo_negative\n", + "\n", + " add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert(\"0\") + NEMO_DIGIT)\n", + " graph_fractional_values = graph_cardinal @ add_leading_zero_to_double_digit\n", + "\n", + " graph_integer_component = pynutil.insert(\"integer_part: \\\"\") + graph_cardinal + pynutil.insert(\"\\\"\")\n", + " graph_fractional_component = pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\")\n", + "\n", + " graph_major_currency = pynutil.insert(\" currency: \\\"\") + major_currency + pynutil.insert(\"\\\"\")\n", + " graph_minor_currency = pynutil.insert(\" currency: \\\"\") + minor_currency + pynutil.insert(\"\\\"\")\n", + "\n", + " graph_only_major_money = graph_integer_component + delete_space + graph_major_currency\n", + " graph_only_minor_money = graph_fractional_component + delete_space + graph_minor_currency \n", + "\n", + " implicit_fractional_part = delete_space + pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\") \n", + " implicit_fractional_part = pynini.closure(implicit_fractional_part, 0, 1) \n", + "\n", + "\n", + " delete_et = pynutil.delete(\"et \") # Sometimes prefaces the minor currency\n", + " delete_et = pynini.closure(delete_et, 0 , 1)\n", + "\n", + " delete_minor = pynutil.delete(minor_currency.project(\"input\")) # to remove the minor currency\n", + "\n", + " explicit_fractional_part = pynutil.insert(\"fractional_part: \\\"\") + graph_fractional_values + pynutil.insert(\"\\\"\") \n", + " explicit_fractional_part = delete_space + delete_et + explicit_fractional_part + delete_space + delete_minor\n", + " explicit_fractional_part = 
pynini.closure(explicit_fractional_part, 0, 1)\n", + "\n", + " graph_major_money = graph_only_major_money + (implicit_fractional_part | explicit_fractional_part)\n", + "\n", + " graph_large_money = pynutil.insert(\" currency: \\\"\") + major_currency_no_normalize + pynutil.insert(\"\\\"\")\n", + " graph_large_money = graph_decimal + delete_space + graph_large_money\n", + "\n", + " final_graph = graph_large_money | graph_major_money | graph_only_minor_money\n", + "\n", + " final_graph = self.add_tokens(final_graph)\n", + " self.fst = final_graph.optimize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see the results:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_text_processing.inverse_text_normalization.fr.taggers import decimal, cardinal\n", + "\n", + "cardFst = cardinal.CardinalFst()\n", + "decFst = decimal.DecimalFst(cardFst)\n", + "\n", + "moneyFst = MoneyFst(cardFst, decFst).fst\n", + "\n", + "example = \"douze virgule cinq billions d'euros\"\n", + "\n", + "apply_fst(example, moneyFst)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gxdcyuLmAZZa" + }, + "source": [ + "## Verbalizer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZZFDWNwY6sOG" + }, + "source": [ + "By this point, the creation of the Verbalizer should be rather straight-forward - delete the expected tokens and perform any specific formatting that was not caught by the Classifier. \n", + "\n", + "In fact, it is so straight-forward that much of the work does not even need to be explicitly managed by the Verbalizer. As mentioned previously, two of the properties we inserted in our Classifier where already referenced in our `DecimalFst` - `integer_part` and `fractional_part`. We even went so far to directly call a component of `DecimalFst` in our Classifier. As such, outside of the `currency` property - there is little in our Money token that is different from a standard Decimal token. Indeed, even the normalized forms are similar (`200,5` vs. `200,5 €`.) " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T7sgH0t79tmU" + }, + "source": [ + "Given these similarities, it seems that we can save ourselves some work and simply use the Decimal Verbalizer to manage much of the normalization. 
Let's look at the basic format of our `MoneyFst` verbalizer, writing it so it accepts a `DecimalFst` as input:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BEu8nITP9mSG" + }, + "outputs": [], + "source": [ + "class MoneyFst(GraphFst):\n", + " def __init__(self, decimal: GraphFst):\n", + " super().__init__(name=\"money\", kind=\"verbalize\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JYVLou5N-Dk8" + }, + "source": [ + "We manage the issue of deleting the `currency` property:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LO35tJ7G-H6N" + }, + "outputs": [], + "source": [ + "class MoneyFst(GraphFst):\n", + " def __init__(self, decimal: GraphFst):\n", + " super().__init__(name=\"money\", kind=\"verbalize\")\n", + " unit = (\n", + " pynutil.delete(\"currency:\")\n", + " + delete_extra_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_NOT_QUOTE, 1)\n", + " + pynutil.delete(\"\\\"\")\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDS8XSII-Dpd" + }, + "source": [ + "Now consider, we need to normalize an integer component, a fractional component, and a decimal to separate them. Since NeMo will automatically permutate all tags, we can assume whatever order we want. As such, we can assume we get the exact order that is accepted by our `DecimalFst`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VtGfpjVA-r3u" + }, + "outputs": [], + "source": [ + " def __init__(self, decimal: GraphFst):\n", + " super().__init__(name=\"money\", kind=\"verbalize\")\n", + " unit = (\n", + " pynutil.delete(\"currency:\")\n", + " + delete_extra_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_NOT_QUOTE, 1)\n", + " + pynutil.delete(\"\\\"\")\n", + " )\n", + " graph = decimal.numbers + delete_space + unit\n", + " delete_tokens = self.delete_tokens(graph)\n", + " self.fst = delete_tokens.optimize()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZefxZLIU-uRU" + }, + "source": [ + "It is as simple and compact as appending the `unit` component to the preexisting `decimal.numbers`. \n", + "\n", + "This feature is worth keeping in mind as you build up to more concrete classes: the combination of guaranteed tag permutations and prebuilt Verbalizers make the addition of semiotic classes progressively simpler despite the building complexity of your entire grammar." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WydC7Cn28l5Y" + }, + "source": [ + "# Time WFST " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VelunbumCJJe" + }, + "source": [ + "Our next composite graph will be for the Time WFST. Here, you may see more variation between your language and our example than with our previous classes. This is for a number of reasons, among them being that while there may be some standard cross linguistic patterns regarding time (e.g. `quantity_of_hours + quantity_of_minutes`), the use of various equivalent phrases can make an exhaustive grammar incredibly specific (e.g. consider managing \"twelve fifteen\", \"twelve and a quarter\", \"quarter past twelve\", \"quarter after twelve\", and \"forty five until one\" all together). 
You may find yourself drawing upon WFSTs that accommodate Cardinals, Fractions, and some basic subtraction.\n", + "\n", + "As such, we are going to focus on those aspects of the Time WFST that are necessary for a functional normalization of time related phrases, saving a more exhaustive grammar for your own specific languages and use cases." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8wqb28wzATOR" + }, + "source": [ + "## Grammar" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AVntDM3AEz0v" + }, + "source": [ + "For our Time WFST, we will focus on the following aspects:\n", + "- Use of 24 or 12 hour base\n", + "- Use of fraction terminology (e.g. \"quarter\" = `15`)\n", + "- Accommodation of key-words (\"noon\", \"midnight\")\n", + "- Counting backwards from the hour (\"ten to five\", \"five to three\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "seU9hTbgFgu7" + }, + "source": [ + "We'll start with the basic system.\n", + "\n", + "For French, time operates on a twenty-four hour system, with the zeroth hour being midnight. Time is given in the following format:\n", + "\n", + "`cardinal + heure(s) + (cardinal)` \n", + "\n", + "This is normalized as:\n", + "\n", + "`cardinal h (cardinal)`\n", + "\n", + "For instance, for `3:03`, we would have:\n", + "- input: \"trois heures trois\"\n", + "- output: `3 h 03`\n", + "\n", + "As such, our grammar needs to utilize a Cardinal WFST and have a means to accept \"heures\" from the input. Taking care of the latter case is simple enough:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HTSVxf4fI_ND" + }, + "outputs": [], + "source": [ + "graph_heures = pynini.accep(\"heure\") + pynini.accep(\"s\").ques\n", + "graph_heures = pynutil.delete(graph_heures)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6LW7pXaXJSZa" + }, + "source": [ + "For the cardinals, we could pass an instance of `CardinalFST` to our graph. But do we really need that level of coverage? We only really need to cover the numbers 0 - 60, which we could simply write a new WFST for. Further, it may be beneficial to allow our graph to separate possible ambiguity. While we will not cover it in our tutorial, you may in the future find it necessary to build a WFST for Measurements, of which quantities of time may play a part. Would it not be helpful for you WFST to know that \"thirty hours\" could only ever be a measurement instead of a possible time of day?\n", + "\n", + "Given the little amount of effort necessary and the quick benefit, we choose to make our hours and minutes explicit in the Time WFST." 
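+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As an aside, when a lookup like this grows long you can keep it in a two-column, tab-separated file and load it with `pynini.string_file` instead of spelling it out inline. A minimal sketch, assuming a hypothetical file `minutes_fr.tsv` whose lines each hold a spoken form and a written form separated by a tab:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Hypothetical alternative: read the spoken -> written pairs from a TSV file.\n", + "# minutes = pynini.string_file(\"minutes_fr.tsv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we keep the maps inline so every entry is visible:"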
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R4aa06ZPLKIR" + }, + "outputs": [], + "source": [ + "hours = pynini.string_map([\n", + " (\"zéro\",\"0\"),\n", + " (\"une\",\"1\"),\n", + " (\"deux\",\"2\"),\n", + " (\"trois\",\"3\"),\n", + " (\"quatre\",\"4\"),\n", + " (\"cinq\",\"5\"),\n", + " (\"six\",\"6\"),\n", + " (\"sept\",\"7\"),\n", + " (\"huit\",\"8\"),\n", + " (\"neuf\",\"9\"),\n", + " (\"dix\",\"10\"),\n", + " (\"onze\",\"11\"),\n", + " (\"douze\",\"12\"),\n", + " (\"treize\",\"13\"),\n", + " (\"quatorze\",\"14\"),\n", + " (\"quinze\",\"15\"),\n", + " (\"seize\",\"16\"),\n", + " (\"dix-sept\",\"17\"),\n", + " (\"dix-huit\",\"18\"),\n", + " (\"dix-neuf\",\"19\"),\n", + " (\"vingt\",\"20\"),\n", + " (\"vingt-et-une\",\"21\"),\n", + " (\"vingt et une\",\"21\"),\n", + " (\"vingt-deux\",\"22\"),\n", + " (\"vingt-trois\",\"23\"),\n", + " (\"vingt-quatre\",\"24\"),\n", + "])\n", + "minutes = pynini.string_map([\n", + " (\"une\", \"01\"),\n", + " (\"deux\", \"02\"),\n", + " (\"trois\", \"03\"),\n", + " (\"quatre\", \"04\"),\n", + " (\"cinq\", \"05\"),\n", + " (\"six\", \"06\"),\n", + " (\"sept\", \"07\"),\n", + " (\"huit\", \"08\"),\n", + " (\"neuf\", \"09\"),\n", + " (\"dix\", \"10\"),\n", + " (\"onze\", \"11\"),\n", + " (\"douze\", \"12\"),\n", + " (\"treize\", \"13\"),\n", + " (\"quatorze\", \"14\"),\n", + " (\"quinze\", \"15\"),\n", + " (\"seize\", \"16\"),\n", + " (\"dix-sept\", \"17\"),\n", + " (\"dix-huit\", \"18\"),\n", + " (\"dix-neuf\", \"19\"),\n", + " (\"vingt\", \"20\"),\n", + " (\"vingt-et-une\", \"21\"),\n", + " (\"vingt et une\", \"21\"),\n", + " (\"vingt-deux\", \"22\"),\n", + " (\"vingt-trois\", \"23\"),\n", + " (\"vingt-quatre\", \"24\"),\n", + " (\"vingt-cinq\", \"25\"),\n", + " (\"vingt-six\", \"26\"),\n", + " (\"vingt-sept\", \"27\"),\n", + " (\"vingt-huit\", \"28\"),\n", + " (\"vingt-neuf\", \"29\"),\n", + " (\"trente\", \"30\"),\n", + " (\"trente-et-une\", \"31\"),\n", + " (\"trente et une\", \"31\"),\n", + " (\"trente-deux\", \"32\"),\n", + " (\"trente-trois\", \"33\"),\n", + " (\"trente-quatre\", \"34\"),\n", + " (\"trente-cinq\", \"35\"),\n", + " (\"trente-six\", \"36\"),\n", + " (\"trente-sept\", \"37\"),\n", + " (\"trente-huit\", \"38\"),\n", + " (\"trente-neuf\", \"39\"),\n", + " (\"quarante\", \"40\"),\n", + " (\"quarante-et-une\", \"41\"),\n", + " (\"quarante et une\", \"41\"),\n", + " (\"quarante-deux\", \"42\"),\n", + " (\"quarante-trois\", \"43\"),\n", + " (\"quarante-quatre\", \"44\"),\n", + " (\"quarante-cinq\", \"45\"),\n", + " (\"quarante-six\", \"46\"),\n", + " (\"quarante-sept\", \"47\"),\n", + " (\"quarante-huit\", \"48\"),\n", + " (\"quarante-neuf\", \"49\"),\n", + " (\"cinquante\", \"50\"),\n", + " (\"cinquante-et-une\", \"51\"),\n", + " (\"cinquante et une\", \"51\"),\n", + " (\"cinquante-deux\", \"52\"),\n", + " (\"cinquante-trois\", \"53\"),\n", + " (\"cinquante-quatre\", \"54\"),\n", + " (\"cinquante-cinq\", \"55\"),\n", + " (\"cinquante-six\", \"56\"),\n", + " (\"cinquante-sept\", \"57\"),\n", + " (\"cinquante-huit\", \"58\"),\n", + " (\"cinquante-neuf\", \"59\"),\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4SmNsNKLM9cC" + }, + "source": [ + "Now that we've managed the basic graph, we can address some of the more niche rules of French timekeeping.\n", + "\n", + "To start, French employs some colloquialisms that will be familiar to English speakers: minutes that are multiples of fifteen are referred to as fractions of a clock. 
In particular:\n", + "- `5 h 15` -> \"cinq heures **et quart**\"\n", + "- `5 h 30` -> \"cinq heures **et demie**\"\n", + "- `5 h 45` -> \"cinq heures **et trois quarts**\"\n", + "\n", + "We thus need a means of rendering these as their numerical equivalents:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xHe3nfrpSlrE" + }, + "outputs": [], + "source": [ + "# Mapping 'et demi(e)' and 'et quart'\n", + "graph_et = pynutil.delete(\"et\") + delete_space\n", + "\n", + "graph_demi = pynini.accep(\"demi\")\n", + "graph_demi += pynini.accep(\"e\").ques # people vary on feminine or masculine form\n", + "graph_demi = pynini.cross(graph_demi, \"30\")\n", + "\n", + "graph_quart = pynini.accep('quart')\n", + "graph_quart = pynini.cross(graph_quart, '15')\n", + "graph_trois_quart = pynini.cross(\"trois quarts\", \"45\")\n", + "\n", + "graph_fractions = graph_demi | graph_quart | graph_trois_quart\n", + "graph_fractions = graph_et + graph_fractions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HD2wobIQS3fX" + }, + "source": [ + "Also like English, French will use key words to designate a specific timeslot. Noon and midnight are \"midi\" and \"minuit\" respectively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ahbkiZFuTN2t" + }, + "outputs": [], + "source": [ + "# Midi and minuit\n", + "graph_midi = pynini.cross(\"midi\", \"12\")\n", + "graph_minuit = pynini.cross(\"minuit\", \"0\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6OyMoqfZTX1U" + }, + "source": [ + "Now it's time to throw a wrench into things: counting backwards from the hour. How are we to get what is essentially a graph to do the subtraction necessary for \"ten to twelve\" to become `11:50`?\n", + "\n", + "Easy: we build the subtraction into the graph itself. That is, we map the hours and minutes produced by our graph onto another graph that produces those values shifted back by the appropriate amount.\n", + "\n", + "Let's take our \"ten to twelve\" example. Normally \"ten\" would map to `10` and \"twelve\" to `12`. But with these new graphs, the detection of the pattern `minute + to + hour` would signal that `10` should now become `50` and `12` become `11`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uMWifbm1VQjP" + }, + "source": [ + "Let us do this for our French example. 
Luckily enough, the indication that a French string is regular: counting backwards from the hour is by use of the pattern `cardinal + heures + moins + minutes`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c4bV3T1pViCH" + }, + "outputs": [], + "source": [ + "hours_to = pynini.string_map([\n", + " (\"1\",\"0\"),\n", + " (\"2\",\"1\"),\n", + " (\"3\",\"2\"),\n", + " (\"4\",\"3\"),\n", + " (\"5\",\"4\"),\n", + " (\"6\",\"5\"),\n", + " (\"7\",\"6\"),\n", + " (\"8\",\"7\"),\n", + " (\"9\",\"8\"),\n", + " (\"10\",\"9\"),\n", + " (\"11\",\"10\"),\n", + " (\"12\",\"11\"),\n", + " (\"13\",\"12\"),\n", + " (\"14\",\"13\"),\n", + " (\"15\",\"14\"),\n", + " (\"16\",\"15\"),\n", + " (\"17\",\"16\"),\n", + " (\"18\",\"17\"),\n", + " (\"19\",\"18\"),\n", + " (\"20\",\"19\"),\n", + " (\"21\",\"20\"),\n", + " (\"22\",\"21\"),\n", + " (\"23\",\"22\"),\n", + " (\"24\",\"23\"),\n", + " (\"0\",\"23\"),\n", + "])\n", + "minutes_to = pynini.string_map([\n", + " (\"59\", \"01\"),\n", + " (\"58\", \"02\"),\n", + " (\"57\", \"03\"),\n", + " (\"56\", \"04\"),\n", + " (\"55\", \"05\"),\n", + " (\"54\", \"06\"),\n", + " (\"53\", \"07\"),\n", + " (\"52\", \"08\"),\n", + " (\"51\", \"09\"),\n", + " (\"50\", \"10\"),\n", + " (\"49\", \"11\"),\n", + " (\"48\", \"12\"),\n", + " (\"47\", \"13\"),\n", + " (\"46\", \"14\"),\n", + " (\"45\", \"15\"),\n", + " (\"44\", \"16\"),\n", + " (\"43\", \"17\"),\n", + " (\"42\", \"18\"),\n", + " (\"41\", \"19\"),\n", + " (\"40\", \"20\"),\n", + " (\"39\", \"21\"),\n", + " (\"38\", \"22\"),\n", + " (\"37\", \"23\"),\n", + " (\"36\", \"24\"),\n", + " (\"35\", \"25\"),\n", + " (\"34\", \"26\"),\n", + " (\"33\", \"27\"),\n", + " (\"32\", \"28\"),\n", + " (\"31\", \"29\"),\n", + " (\"30\", \"30\"),\n", + " (\"29\", \"31\"),\n", + " (\"28\", \"32\"),\n", + " (\"27\", \"33\"),\n", + " (\"26\", \"34\"),\n", + " (\"25\", \"35\"),\n", + " (\"24\", \"36\"),\n", + " (\"23\", \"37\"),\n", + " (\"22\", \"38\"),\n", + " (\"21\", \"39\"),\n", + " (\"20\", \"40\"),\n", + " (\"19\", \"41\"),\n", + " (\"18\", \"42\"),\n", + " (\"17\", \"43\"),\n", + " (\"16\", \"44\"),\n", + " (\"15\", \"45\"),\n", + " (\"14\", \"46\"),\n", + " (\"13\", \"47\"),\n", + " (\"12\", \"48\"),\n", + " (\"11\", \"49\"),\n", + " (\"10\", \"50\"),\n", + " (\"09\", \"51\"),\n", + " (\"08\", \"52\"),\n", + " (\"07\", \"53\"),\n", + " (\"06\", \"54\"),\n", + " (\"05\", \"55\"),\n", + " (\"04\", \"56\"),\n", + " (\"03\", \"57\"),\n", + " (\"02\", \"58\"),\n", + " (\"01\", \"59\"),\n", + "])\n", + "graph_moins = pynutil.delete(\"moins\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XOKETkIYZy5M" + }, + "source": [ + "Why graph the digits instead of the tokens themselves? Along with avoiding some minor repetition and making editing more apparent, it allows this subgraph to be ported to other languages - if so desired.\n", + "\n", + "Further, it helps us illustrate a helpful idea within this tutorial: as long as a pattern is regular and/or finite, it is no major issue to accommodate it in our graph, regardless of mathematic or logic system it employs." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DJbFiD2fAUc5" + }, + "source": [ + "## Classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cK0SGXntaDkI" + }, + "source": [ + "Once again we place the grammar within the proper child class of `GraphFst`. 
We also insert the proper tags for the `Time` class, which are:\n", + "- `hours`\n", + "- `minutes`\n", + "- `suffix` (explained within this section)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9Eq5r-_VbBIg" + }, + "outputs": [], + "source": [ + "graph_hours_component = pynini.union(hours, graph_midi, graph_minuit)\n", + "graph_hours_component = pynutil.insert(\"hours: \\\"\") + graph_hours_component + pynutil.insert(\"\\\"\")\n", + "\n", + "graph_minutes_component = (\n", + " pynutil.insert(\" minutes: \\\"\") + pynini.union(minutes, graph_fractions) + pynutil.insert(\"\\\"\")\n", + ") \n", + "graph_minutes_component = delete_space + graph_minutes_component\n", + "\n", + "graph_time_standard = (graph_hours_component + delete_space + graph_heures \n", + " + pynini.closure(graph_minutes_component, 0, 1))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2avfS3IacSiC" + }, + "source": [ + "We now setup the alternate graph that allows backwards counting. Note, this is triggered by the occurrence of \"moins\" between the hour and minute component." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TmpwisOVcn0T" + }, + "outputs": [], + "source": [ + "graph_hours_to_component = hours | graph_midi | graph_minuit\n", + "graph_hours_to_component @= hours_to\n", + "graph_hours_to_component = pynutil.insert(\"hours: \\\"\") + graph_hours_to_component + pynutil.insert(\"\\\"\")\n", + "graph_hours_to_component = graph_hours_to_component + delete_space + graph_heures\n", + "\n", + "graph_minutes_to_component = (minutes | graph_demi | # No 'et' in fractions\n", + " (pynutil.delete(\"le \") + graph_quart) | graph_trois_quart)\n", + "graph_minutes_to_component @= minutes_to\n", + "graph_minutes_to_component = pynutil.insert(\" minutes: \\\"\") + graph_minutes_to_component + pynutil.insert(\"\\\"\")\n", + "\n", + "graph_time_to = graph_hours_to_component + delete_space + graph_moins + delete_space + graph_minutes_to_component" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FkO4tRRfdQT4" + }, + "source": [ + "We now join it with our main component, allowing us to graph all times:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0O0vUVizdU8c" + }, + "outputs": [], + "source": [ + "graph_time = graph_time_standard | graph_time_to" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jbX4JV-LdY3Y" + }, + "source": [ + "Once again we throw a wrench into things with the `suffix` feature. As in the case of Ordinals and Decimals, key-words can play into our Time WFST. For French, this occurs with the words \"du matin\", \"de l'après-midi\", and \"du soir\". (Respectively: \"in the morning\", \"in the afternoon\", and \"in the evening\".) Much like in English, these phrases alter how we write down the time. But instead of indicating `a.m.` or `p.m.`, these indicate *what hour system is used*. For example:\n", + "- \"deux heures du matin\" -> `2 h` = `2:00 a.m.`\n", + "- \"deux heures de l'après-midi\" -> `14 h` = `2:00 p.m.`\n", + "\n", + "Only a twelve hour system is used when these suffixes accompany the time. As such, our Classifier will need to either adjust the times like in the case of counting backwards or must pass the information to the Verbalizer so it can adjust. \n", + "\n", + "Since our Classifier is long enough as is, we will simply store this information in the `suffix` property and allow the Verbalizer to manage." 
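+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Concretely, once the suffix component below is in place, we would expect classifier output roughly of the following shape (an illustrative sketch; exact spacing may vary, and NeMo permutes the tags in any case):\n", + "\n", + "```\n", + "deux heures du matin -> time { hours: \"2\" suffix: \"am\" }\n", + "deux heures du soir -> time { hours: \"2\" suffix: \"pm\" }\n", + "```"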
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OqVa78zRgJw9" + }, + "outputs": [], + "source": [ + "graph_suffix_am = pynini.cross(\"du matin\", \"am\")\n", + "graph_suffix_pm = pynini.string_map([(\"de l'après-midi\", \"pm\"),(\"du soir\", \"pm\")])\n", + "\n", + "graph_suffix = pynini.cross(graph_suffix_am, \"am\") | pynini.cross(graph_suffix_pm, \"pm\")\n", + "\n", + "graph_suffix_component = pynutil.insert(\" suffix: \\\"\") + graph_suffix + pynutil.insert(\"\\\"\")\n", + "graph_suffix_component = delete_space + graph_suffix_component\n", + "graph_suffix_component = pynini.closure(graph_suffix_component, 0, 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-LaJMIjUf1XR" + }, + "source": [ + "And we append to our graph:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "76myCFiggX3E" + }, + "outputs": [], + "source": [ + "class TimeFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"time\", kind=\"classify\")\n", + " \"\"\"grammar omitted for length\n", + " ....\n", + " ....\n", + " ....\n", + " \"\"\"\n", + " graph_hours_component = pynini.union(hours, graph_midi, graph_minuit)\n", + " graph_hours_component = pynutil.insert(\"hours: \\\"\") + graph_hours_component + pynutil.insert(\"\\\"\")\n", + "\n", + " graph_minutes_component = (\n", + " pynutil.insert(\" minutes: \\\"\") + pynini.union(minutes, graph_fractions) + pynutil.insert(\"\\\"\")\n", + " ) \n", + " graph_minutes_component = delete_space + graph_minutes_component\n", + "\n", + " graph_time_standard = (graph_hours_component + delete_space + graph_heures \n", + " + pynini.closure(graph_minutes_component, 0, 1))\n", + "\n", + " graph_hours_to_component = hours | graph_midi | graph_minuit\n", + " graph_hours_to_component @= hours_to\n", + " graph_hours_to_component = pynutil.insert(\"hours: \\\"\") + graph_hours_to_component + pynutil.insert(\"\\\"\")\n", + " graph_hours_to_component = graph_hours_to_component + delete_space + graph_heures\n", + "\n", + " graph_minutes_to_component = (minutes | graph_demi | # No 'et' in fractions\n", + " (pynutil.delete(\"le \") + graph_quart) | graph_trois_quart)\n", + " graph_minutes_to_component @= minutes_to\n", + " graph_minutes_to_component = pynutil.insert(\" minutes: \\\"\") + graph_minutes_to_component + pynutil.insert(\"\\\"\")\n", + "\n", + " graph_time_to = graph_hours_to_component + delete_space + graph_moins + delete_space + graph_minutes_to_component\n", + "\n", + " graph_time_no_suffix = graph_time_standard | graph_time_to\n", + "\n", + " graph_suffix_am = pynini.cross(\"du matin\", \"am\")\n", + " graph_suffix_pm = pynini.string_map([(\"de l'après-midi\", \"pm\"),(\"du soir\", \"pm\")])\n", + "\n", + " graph_suffix = pynini.cross(graph_suffix_am, \"am\") | pynini.cross(graph_suffix_pm, \"pm\")\n", + "\n", + " graph_suffix_component = pynutil.insert(\" suffix: \\\"\") + graph_suffix + pynutil.insert(\"\\\"\")\n", + " graph_suffix_component = delete_space + graph_suffix_component\n", + " graph_suffix_component = pynini.closure(graph_suffix_component, 0, 1)\n", + " \n", + " final_graph = graph_time_no_suffix + graph_suffix_component\n", + "\n", + " final_graph = self.add_tokens(final_graph)\n", + "\n", + " self.fst = final_graph.optimize()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see how we did:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "time = 
TimeFst().fst\n", + "example = \"quatre heures moins cinq\"\n", + "apply_fst(example, time)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lPlJ1qyeAWOL" + }, + "source": [ + "## Verbalizer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CrO-xtJ87PEl" + }, + "source": [ + "The initial part of the Verbalizer should appear familiar. We delete the property tags `hours` and `minutes`, making sure to preserve the actual values for formatting." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fCzZKR7ek0Mz" + }, + "outputs": [], + "source": [ + "hour = (\n", + " pynutil.delete(\"hours:\")\n", + " + delete_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_DIGIT, 1, 2)\n", + " + pynutil.delete(\"\\\"\")\n", + ")\n", + "minute = (\n", + " pynutil.delete(\"minutes:\")\n", + " + delete_extra_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_DIGIT, 1, 2)\n", + " + pynutil.delete(\"\\\"\")\n", + ")\n", + "graph = hour + delete_extra_space + pynutil.insert(\"h\") + minute.ques" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WnVV9GUKk-b7" + }, + "source": [ + "We then deal with the case of `suffix`. We first note that if the suffix is for a morning time (before noon), then no further conversion is needed. We may simply delete the property and its value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "haOEiSbglc6s" + }, + "outputs": [], + "source": [ + "day_suffixes = pynutil.delete(\"suffix: \\\"am\\\"\")\n", + "\n", + "graph = hour + delete_extra_space + pynutil.insert(\"h\") + minute.ques + delete_space + day_suffixes.ques" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wL0FNg6Xlhb-" + }, + "source": [ + "Meanwhile, the post-noon suffixes require us to shift the hours value by twelve. Much like in the case of counting backwards from the hour, we can simply create a WFST to do this addition work for us."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YLrabUNplwG7" + }, + "outputs": [], + "source": [ + "hour_to_night = pynini.string_map([\n", + " (\"1\", \"13\"),\n", + " (\"2\", \"14\"),\n", + " (\"3\", \"15\"),\n", + " (\"4\", \"16\"),\n", + " (\"5\", \"17\"),\n", + " (\"6\", \"18\"),\n", + " (\"7\", \"19\"),\n", + " (\"8\", \"20\"),\n", + " (\"9\", \"21\"),\n", + " (\"10\", \"22\"),\n", + " (\"11\", \"23\"), # Note that 12 and 24 would be phrased \"midi\" and \"minuit\" respectively\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "X0-z-qJAmIiI" + }, + "source": [ + "We then create an alternate graph where this conversion is mapped onto the hours function - given a post-noon suffix - and create a union with our earlier graph:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8CdEmo9NmN7u" + }, + "outputs": [], + "source": [ + "night_suffixes = pynutil.delete(\"suffix: \\\"pm\\\"\")\n", + "graph |= (\n", + " hour @ hour_to_night\n", + " + delete_extra_space\n", + " + pynutil.insert(\"h\")\n", + " + minute.ques\n", + " + delete_space\n", + " + night_suffixes\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YnoIkZBqmaTo" + }, + "source": [ + "Giving us a final Verbalizer of:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZfXimvFBmdDD" + }, + "outputs": [], + "source": [ + "class TimeFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"time\", kind=\"verbalize\")\n", + "\n", + " hour_to_night = pynini.string_map([\n", + " (\"1\", \"13\"),\n", + " (\"2\", \"14\"),\n", + " (\"3\", \"15\"),\n", + " (\"4\", \"16\"),\n", + " (\"5\", \"17\"),\n", + " (\"6\", \"18\"),\n", + " (\"7\", \"19\"),\n", + " (\"8\", \"20\"),\n", + " (\"9\", \"21\"),\n", + " (\"10\", \"22\"),\n", + " (\"11\", \"23\"),\n", + "])\n", + "\n", + " day_suffixes = pynutil.delete(\"suffix: \\\"am\\\"\")\n", + " night_suffixes = pynutil.delete(\"suffix: \\\"pm\\\"\")\n", + "\n", + " hour = (\n", + " pynutil.delete(\"hours:\")\n", + " + delete_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_DIGIT, 1, 2)\n", + " + pynutil.delete(\"\\\"\")\n", + " )\n", + " minute = (\n", + " pynutil.delete(\"minutes:\")\n", + " + delete_extra_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_DIGIT, 1, 2)\n", + " + pynutil.delete(\"\\\"\")\n", + " )\n", + "\n", + " graph = hour + delete_extra_space + pynutil.insert(\"h\") + minute.ques + delete_space + day_suffixes.ques\n", + "\n", + " graph |= (\n", + " hour @ hour_to_night\n", + " + delete_extra_space\n", + " + pynutil.insert(\"h\")\n", + " + minute.ques\n", + " + delete_space\n", + " + night_suffixes\n", + " )\n", + " delete_tokens = self.delete_tokens(graph)\n", + " self.fst = delete_tokens.optimize()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e5tPcCaSYuhY" + }, + "source": [ + "If you've noticed, the Verbalizer process has become simpler as we've progressed through our WFSTs. Commonly, you will seldom need to even provide the amount of overhead we've seen in `TimeFst`, `MoneyFst`, and `OrdinalFst`, and the majority of this component is simply removing tokens as an intermediary step, as we'll see for our Name class." 
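Before moving on, it is worth sanity checking the finished Time Verbalizer with the same `apply_fst` helper used earlier. The tagged input below is hypothetical but mirrors what our Classifier produces, and the expected rendering follows the `14 h` convention described above:

```Python
# Illustrative check of the TimeFst Verbalizer (kind="verbalize") defined above.
time_verbalizer = TimeFst().fst
apply_fst('time { hours: "2" minutes: "30" suffix: "pm" }', time_verbalizer)
# expected: 14 h 30   (an "am" suffix would leave the hour untouched: 2 h 30)
```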
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iHmRe3UIhyIH" + }, + "source": [ + "# WhiteList WFST " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8kMn2qB9bVFy" + }, + "source": [ + "\n", + "While developing your grammars, you may encounter tokens that refuse standard categorization and yet still require normalization. For example, you may need to render \"Mister Brown\" as `Mr. Brown` or \"H M S Nelson\" as `H.M.S. Nelson`. As these cases are rather specific, they lack a regular pattern that a dedicated classifier could exploit. (What is it about \"mister\" as a token that requires normalization, as opposed to \"Brown\"?) Instead, we need to explicitly list their input-output mappings (i.e. a whitelist).\n", + "\n", + "For NeMo, this is performed through the `WhiteListFst`:\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6B4oPXYcccWs" + }, + "source": [ + "## Grammar" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RThTLUCRceOO" + }, + "source": [ + "`WhiteListFst` is essentially just a wrapper for a `string_map` or `string_file` mapping with the appropriate formatting for deployment. Per our example, we can make a graph with the following:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eIOOb_wJdMMx" + }, + "outputs": [], + "source": [ + "graph = pynini.string_map([\n", + " (\"mister\", \"mr.\"),\n", + " (\"h m s\", \"h.m.s.\"),\n", + " (\"doctor\", \"dr.\")\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "O5kTXwmPZ9Tt" + }, + "source": [ + "As previously mentioned, here is where the use of `string_file` will make maintenance much easier. Discovering whitelist mappings is an iterative process and you will more than likely need to return to your list throughout development. For instance, it may be obvious to include tokens such as \"madame\", \"miss\", and \"esquire\", but would you think of providing abbreviations for \"the right honorable\" or \"tennessee valley authority\"? Keeping a tsv file available for quick insertions greatly assists here." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RC5Cf-Z8dYVk" + }, + "source": [ + "## Classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "144nvAHEdfBJ" + }, + "source": [ + "Unlike our other WFSTs, there is no specific semiotic class for `WhiteListFst`. It instead falls under the default Name class to designate that there is no need for further processing beyond obligatory tokenization. Indeed, we can simply insert the token ourselves instead of calling `add_tokens`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oPkrmg2gdznd" + }, + "outputs": [], + "source": [ + "class WhiteListFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"whitelist\", kind=\"classify\")\n", + "\n", + " whitelist = pynini.string_map([\n", + " (\"mister\", \"mr.\"),\n", + " (\"h m s\", \"h.m.s.\"),\n", + " (\"doctor\", \"dr.\")])\n", + " graph = pynutil.insert(\"name: \\\"\") + convert_space(whitelist) + pynutil.insert(\"\\\"\")\n", + " self.fst = graph.optimize()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B05kdSIdd2dv" + }, + "source": [ + "## Verbalizer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since the whitelisted token has already been rendered in the desired normalized form, all that is necessary is to strip the `name` token and render the string 'as is'. 
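For instance, the Classifier above maps "mister" onto a `name` token (a quick, illustrative check with the `apply_fst` helper):

```Python
apply_fst("mister", WhiteListFst().fst)   # the "classify" version above -> name: "mr."
```

All the Verbalizer has to do is remove that `name: "..."` wrapper.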
This can be done through the following:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gaq3voIYiUCA" + }, + "outputs": [], + "source": [ + "class WhiteListFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"whitelist\", kind=\"verbalize\")\n", + " graph = (\n", + " pynutil.delete(\"name:\")\n", + " + delete_space\n", + " + pynutil.delete(\"\\\"\")\n", + " + pynini.closure(NEMO_CHAR - \" \", 1)\n", + " + pynutil.delete(\"\\\"\")\n", + " )\n", + " graph = graph @ pynini.cdrewrite(pynini.cross(u\"\\u00A0\", \" \"), \"\", \"\", NEMO_SIGMA) # Converts the non-breaking space inserted by convert_space back into a regular space\n", + " self.fst = graph.optimize()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cUE7Gg35bWKb" + }, + "source": [ + "While the graph is largely self-explanatory, take note that the default implementation assumes a character string without spacing. If you intend to include additional formatting in your normalization (e.g. `H. M. S.` instead of `H.M.S.`), you may need to adjust the graph to expand coverage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_o_a15Fg7niv" + }, + "source": [ + "# Word and Punctuation WFST " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Zi6lP7mTmnUV" + }, + "source": [ + "Continuing with the Name class, we will conclude with the Word and Punctuation WFSTs. These are among the simplest and most crucial classes of the entire ITN system, as they classify all tokens that are not caught by other semiotic classes. Since these other tokens make up the majority of all strings your normalization system will encounter, they are essential for general functionality.\n", + "\n", + "However, they require little discussion, as their function is self-evident: since they serve as default classes, tokens only reach the Word WFST and Punctuation WFST if they have not been accepted by the other WFSTs. As such, we can simply accept the tokens as they are, providing them a `name` tag." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9zCqczLqp5NW" + }, + "source": [ + "## Classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eUWum5U0p99c" + }, + "source": [ + "For instance, consider the `WordFst` Classifier in its entirety:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CCZSTeDHofDl" + }, + "outputs": [], + "source": [ + "class WordFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"word\", kind=\"classify\")\n", + " word = pynutil.insert(\"name: \\\"\") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert(\"\\\"\")\n", + " self.fst = word.optimize()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9ys2VpjjoiEC" + }, + "source": [ + "It just processes the entire token string with the `NEMO_NOT_SPACE` utility WFST (which accepts any string that is not a space). For your language, you may simply reuse one of the preexisting `WordFst` classes.\n", + "\n", + "Depending on the language, the `PunctuationFst` may require some (minimal) adjustment. 
Note the following:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mnnd3PVMpF4t" + }, + "outputs": [], + "source": [ + "class PunctuationFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"punctuation\", kind=\"classify\")\n", + "\n", + " s = \"!#$%&\\'()*+,-./:;<=>?@^_`{|}~\"\n", + " punct = pynini.union(*s)\n", + "\n", + " graph = pynutil.insert(\"name: \\\"\") + punct + pynutil.insert(\"\\\"\")\n", + "\n", + " self.fst = graph.optimize()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_afW02LXpLtz" + }, + "source": [ + "If your language uses other punctuation than that in the `s` string (or reserves some of the punctuation as characters), you may simply edit `s` to accommodate. \n", + "\n", + "For instance, French has a unique quotation style that utilizes guillemets \"« »\". We may add their Unicode codepoints (to avoid encoding issues) to `s`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mgfZIKzVplVm" + }, + "outputs": [], + "source": [ + "class PunctuationFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"punctuation\", kind=\"classify\")\n", + "\n", + " s = \"!#$%&\\'()*+,-./:;<=>?@^_`{|}~\"\n", + " guillemets = \"\\u00AB\" + \"\\u00BB\" # quotation marks in French.\n", + " s += guillemets\n", + " punct = pynini.union(*s)\n", + "\n", + " graph = pynutil.insert(\"name: \\\"\") + punct + pynutil.insert(\"\\\"\")\n", + "\n", + " self.fst = graph.optimize()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6Upb5-wcp_7H" + }, + "source": [ + "## Verbalizer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ufWT1T6GqCCT" + }, + "source": [ + "Note that both `PunctuationFst` and `WordFst` both encode with the `name` property. This leaves no differentiation between the two for a Verbalizer. This makes sense as there are no particular formatting rules for them, they simply need a placeholder tag to avoid alteration between the Classifier and Verbalizer step. Once passed to the verbalizer, they are rendered as normal by simply removing the tag (this is practically identical to the WhiteListFST):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LqyhqQKZqcph" + }, + "outputs": [], + "source": [ + "class WordFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"word\", kind=\"verbalize\")\n", + " chars = pynini.closure(NEMO_CHAR - \" \", 1)\n", + " char = pynutil.delete(\"name:\") + delete_space + pynutil.delete(\"\\\"\") + chars + pynutil.delete(\"\\\"\")\n", + " graph = char @ pynini.cdrewrite(pynini.cross(u\"\\u00A0\", \" \"), \"\", \"\", NEMO_SIGMA) # Cleans up possible null character\n", + "\n", + " self.fst = graph.optimize()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lGbrUkcpapyi" + }, + "source": [ + "For many languages, the writing of your `WordFst` and `PunctuationFst` (both Classifiers and Verbalizers) will require no more than duplicating the preexisting grammars found in NeMo Text Processing." 
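As a quick illustration of what these default classes produce (hypothetical calls, using the "classify" versions shown above):

```Python
# Both default classes simply wrap the untouched token in a name tag:
#   apply_fst("bonjour", WordFst().fst)      ->  name: "bonjour"
#   apply_fst("«", PunctuationFst().fst)     ->  name: "«"
```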
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5y9jhkhQ7p4W" + }, + "source": [ + "# Other Classes " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "j1mgnISmiu-g" + }, + "source": [ + "While the preceding discussion should be suitable for development of the remaining classes, some helpful notes may be of use before continuing:\n", + "- Fraction WFST: This is the last of the 'fundamental' classes and should take priority after completion of the Decimal WFST. It operates very similarly to the Ordinal WFST in that you wish to recover the Cardinal roots for the numerator and denominator prior to tagging. Its properties are: `negative`, `integer_part`, `numerator`, and `denominator`.\n", + "- Measure WFST: Like the Money WFST, this will require management of several 'parent' WFSTS (Fraction, Cardinal, Decimal) to be suitably comprehensive. As well, you may find it more productive to find ways to compose new measurement units instead of simply listing all (e.g. micrometers, petameters, miles per hour, feet per second). Its properties are: `negative`, `units` and it allows subgraphs of the `cardinal`, `decimal`, and `fraction` classes. (This is, it allows tokenization within the tokenization.)\n", + "- Date WFST: Depending on writing conventions, this may vary in complexity. For instance, English speakers may write dates as `01/01/2021/` or `Jan. 1 2021`. Are there specific use cases where one is preferred or should you simply decide on a format? Further, you may wish to take advantage of the `preserve order` property to avoid possible unwanted verbalizations (some implementations will permit both `Jan. 1` and `1 Jan.` if not careful.) Its properties are: `month`, `day`, and `year`. \n", + "- Telephone WFST: These will be heavily dependent not only on writing conventions but even regional preference. For instance, the U.S. commonly uses a ten digit system broken into the following sequence: `###-###-####`. Meanwhile, mainland France breaks a ten digit sequence into groups of two: `##-##-##-##-##`. Take careful note of how your language's target region verbalizes these figures and leave room for some variation in development. The `telephone` class has only one property: `number_part`. \n", + "- Electronic WFST: For normalizing email addresses or urls, you will need to develop for the `electronic` class. The main concerns will be managing alphanumeric strings and parsing the reserved symbols used for protocols and domains. (How does your target language pronounce \"https://\"? www? '.' or '@'?\") Depending on whether you are normalizing a url or email, the following properties will be needed:\n", + " - email: `username`, `domain`\n", + " - url: `protocol` (Sparrowhawk allows further detail here but NeMo passes the entire url through the `protocol` property)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-i25X8mK90n3" + }, + "source": [ + "# Tokenize and Classify " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v4bcigU6b9ss" + }, + "source": [ + "We are now ready to build a general Classifier for our entire language. Upon completion of your grammars, the next step is to unite them together in a general Classifier WFST - located within a `tokenize_and_classify.py` file, preferably. 
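For reference, the imports used in the rest of this section assume a per-language layout roughly like the following (shown for the French grammars; your language would mirror it):

```
nemo_text_processing/inverse_text_normalization/fr/
    taggers/        # Classifiers: cardinal.py, decimal.py, ..., tokenize_and_classify.py
    verbalizers/    # Verbalizers: cardinal.py, ..., verbalize.py, verbalize_final.py
    data/           # optional .tsv files loaded with string_file
```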
This WFST will be responsible for determining the appropriate semiotic class for each token in your string and processing the necessary properties for normalization.\n", + "\n", + "For this section, we will focus on the following: grammar composition, assignment of weights, and importing/exporting as a FAR file. Since we will need to work with some instantiated graphs, let's preload them before proceeding. (Note the compile time.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_text_processing.inverse_text_normalization.fr.taggers.cardinal import CardinalFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.decimal import DecimalFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.money import MoneyFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.ordinal import OrdinalFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.punctuation import PunctuationFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.time import TimeFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.whitelist import WhiteListFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.taggers.word import WordFst\n", + "\n", + "cardinal = CardinalFst()\n", + "cardinal_graph = cardinal.fst\n", + "\n", + "ordinal = OrdinalFst(cardinal)\n", + "ordinal_graph = ordinal.fst\n", + "\n", + "decimal = DecimalFst(cardinal)\n", + "decimal_graph = decimal.fst\n", + "\n", + "whitelist_graph = WhiteListFst().fst\n", + "word_graph = WordFst().fst\n", + "time_graph = TimeFst().fst\n", + "money_graph = MoneyFst(cardinal, decimal).fst\n", + "punct_graph = PunctuationFst().fst" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MIv58eSocOV1" + }, + "source": [ + "## Grammar" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k_RPlnfVdG5E" + }, + "source": [ + "As with all previous grammars, the `tokenize_and_classify` grammar inherits from `GraphFst` as an individual class: `ClassifyFst`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WHKG4c2WdW0G" + }, + "outputs": [], + "source": [ + "class ClassifyFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "j9_I6DJmdcOG" + }, + "source": [ + "This class is responsible for instantiating all subgraphs and passing necessary dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4YtmcxLOdlas" + }, + "outputs": [], + "source": [ + "class ClassifyFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")\n", + "\n", + " cardinal = CardinalFst()\n", + " cardinal_graph = cardinal.fst\n", + "\n", + " ordinal = OrdinalFst(cardinal)\n", + " ordinal_graph = ordinal.fst\n", + "\n", + " decimal = DecimalFst(cardinal)\n", + " decimal_graph = decimal.fst\n", + "\n", + " whitelist_graph = WhiteListFst().fst\n", + " word_graph = WordFst().fst\n", + " time_graph = TimeFst().fst\n", + " money_graph = MoneyFst(cardinal, decimal).fst\n", + " punct_graph = PunctuationFst().fst" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y5vGvv3HeAY9" + }, + "source": [ + "We then join all the grammars together so `ClassifyFst` can apply them. 
Rather unceremoniously, this is accomplished by performing a union across all grammars (excluding `PunctuationFst`, to assist tokenization). We then follow this union by inserting the `tokens` class around the resulting formatting (required for processing):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oocgPQ5geZJO" + }, + "outputs": [], + "source": [ + "class ClassifyFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")\n", + "\n", + " cardinal = CardinalFst()\n", + " cardinal_graph = cardinal.fst\n", + "\n", + " ordinal = OrdinalFst(cardinal)\n", + " ordinal_graph = ordinal.fst\n", + "\n", + " decimal = DecimalFst(cardinal)\n", + " decimal_graph = decimal.fst\n", + "\n", + " whitelist_graph = WhiteListFst().fst\n", + " word_graph = WordFst().fst\n", + " time_graph = TimeFst().fst\n", + " money_graph = MoneyFst(cardinal, decimal).fst\n", + " punct_graph = PunctuationFst().fst\n", + "\n", + " classify = (\n", + " time_graph\n", + " | whitelist_graph\n", + " | decimal_graph\n", + " | cardinal_graph\n", + " | ordinal_graph\n", + " | money_graph\n", + " | word_graph\n", + " )\n", + " token = pynutil.insert(\"tokens { \") + classify + pynutil.insert(\" }\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ASWDXWQjfLEU" + }, + "source": [ + "Our graph is now able to process an individual token. But what about a string? Here you will need to be mindful of the tokenization behavior for your language and decide on your desired treatment of punctuation (hence exclusion from the main graph). \n", + "\n", + "For our purposes, we will assume the convention of space and punctuation serving as token separators. We graph punctuation as individual tokens" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "r6WztK2jwhFt" + }, + "outputs": [], + "source": [ + "punct_graph = PunctuationFst().fst\n", + "punct = pynutil.insert(\"tokens { \") + punct_graph + pynutil.insert(\" }\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9T2rT89jw3T1" + }, + "source": [ + "and join the `punct` graph with our `tokens` graph (inserting spaces between tokens for formatting)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rGtVOK-txKOP" + }, + "outputs": [], + "source": [ + "token = \"PLACEHOLDER\"\n", + "token_plus_punct = (\n", + " pynini.closure(punct + pynutil.insert(\" \")) + token + pynini.closure(pynutil.insert(\" \") + punct)\n", + " ) # Note the use of closure incase there are multiple punctuations\n", + "graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_gixfQ69xWPe" + }, + "source": [ + "then address space between tokens: \n", + "\n", + "`graph = delete_space + graph + delete_space`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DWnmazWecyUG" + }, + "source": [ + "## Weighting " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "egHbwIbMx-hT" + }, + "source": [ + "Were we to leave our `ClassifyFst` like this, we would undoubtedly encounter a mountain of errors. What will stop our graph from treating punctuation that is part of a previous grammar as a token separator (e.g. \"vingt-et-un\")? 
How do we ensure that a currency string isn't treated as solely a decimal string with a `name` token following?\n", + "\n", + "As in previous cases, the solution lies in our choice of weights for the grammar." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y3U7_M8CyxZ1" + }, + "source": [ + "Let us return to the main graph:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9VXe1dfsy3Be" + }, + "outputs": [], + "source": [ + "classify = (\n", + " time_graph\n", + " | whitelist_graph\n", + " | decimal_graph\n", + " | cardinal_graph\n", + " | ordinal_graph\n", + " | money_graph\n", + " | word_graph\n", + " )\n", + "punct = pynutil.insert(\"tokens { \") + punct_graph + pynutil.insert(\" }\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aY4vOFqxy5ua" + }, + "source": [ + "Beyond the path weights that we explicitly added, these graphs are currently weightless. Since we want the graphs themselves to be the general determiners of a path, let us use some default weights an order of magnitude beyond our path weights (we use `pynutil.add_weight`):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bthyt_Le2rsA" + }, + "outputs": [], + "source": [ + "classify = (\n", + " pynutil.add_weight(time_graph, 1)\n", + " | pynutil.add_weight(whitelist_graph, 1)\n", + " | pynutil.add_weight(decimal_graph, 1)\n", + " | pynutil.add_weight(cardinal_graph, 1)\n", + " | pynutil.add_weight(ordinal_graph, 1)\n", + " | pynutil.add_weight(money_graph, 1)\n", + " | pynutil.add_weight(word_graph, 1)\n", + " )\n", + "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1) + pynutil.insert(\" }\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xMNIJbzj3MMP" + }, + "source": [ + "Let's see what logical adjustments should be made. First off, we know that we want each class token to span the largest string possible. (e.g. We don't want \"quatre-vingt\" to be rendered as two `cardinal` classes with a hyphen in between.) As such, we want to penalize our graph for using more than one tokens. We can do so by establishing the following constraint: the sum of two or more tokens cannot be less than the weight of a single token. Or, for any pair of tokens `w_1` and `w_2`, their sum must always be greater than any other individual token (including themselves):\n", + "\n", + "`w_1 + w_2 > k >= w`\n", + "\n", + "To keep things simple, let us make the upper limit `2`. This means we should increase all the weights to keep our constraint:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "classify = (\n", + " pynutil.add_weight(time_graph, 1.1)\n", + " | pynutil.add_weight(whitelist_graph, 1.1)\n", + " | pynutil.add_weight(decimal_graph, 1.1)\n", + " | pynutil.add_weight(cardinal_graph, 1.1)\n", + " | pynutil.add_weight(ordinal_graph, 1.1)\n", + " | pynutil.add_weight(money_graph, 1.1)\n", + " | pynutil.add_weight(word_graph, 1.1)\n", + " )\n", + "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(\" }\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Do we want this constraint to include all tokens? Imagine if we had a string of multiple semiotic tokens in a row. Since this string's combined weight would be larger than any single class token, a grammar that served as a universal acceptor (i.e. 
`word_graph`) would be preferred over these individual classes. This would be obviously incorrect. As such, we want to make sure that `word_graph` would only be traversed when there is truly no other option:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qc_CU2ro63eg" + }, + "outputs": [], + "source": [ + "classify = (\n", + " pynutil.add_weight(time_graph, 1.1)\n", + " | pynutil.add_weight(whitelist_graph, 1.1)\n", + " | pynutil.add_weight(decimal_graph, 1.1)\n", + " | pynutil.add_weight(cardinal_graph, 1.1)\n", + " | pynutil.add_weight(ordinal_graph, 1.1)\n", + " | pynutil.add_weight(money_graph, 1.1)\n", + " | pynutil.add_weight(word_graph, 100)\n", + " )\n", + "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(\" }\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, even with a string of fifty different class tokens, `word_graph` would still not be considered as a path to traverse." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fW8C3vD-7Dbl" + }, + "source": [ + "Next, let us consider our foundational graph: `cardinal_graph`. As Cardinals occur in practically all our WFSTs, it's possible for `cardinal_graph` to apply in almost all cases. Yet, we've specifically invoked `CardinalFST` when it was required in any of the other classes, so it will never be needed in any of those cases. This means that we want all those graphs to have *priority* over `cardinal_graph`. As such, we will increase its weight so it takes second lowest precedence (while still paying attention to the combined weight constraint). " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "97UwGaEn8pj7" + }, + "outputs": [], + "source": [ + "classify = (\n", + " pynutil.add_weight(time_graph, 1.1)\n", + " | pynutil.add_weight(whitelist_graph, 1.1)\n", + " | pynutil.add_weight(decimal_graph, 1.1)\n", + " | pynutil.add_weight(cardinal_graph, 1.2)\n", + " | pynutil.add_weight(ordinal_graph, 1.1)\n", + " | pynutil.add_weight(money_graph, 1.1)\n", + " | pynutil.add_weight(word_graph, 100)\n", + " )\n", + "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(\" }\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0d9Lw4Ot88_B" + }, + "source": [ + "This form of thinking can be applied to all the 'foundational' graphs you may develop: the dependent graphs should take higher precedence than the graphs they borrow from. For instance, since `money_graph` utilizes `decimal_graph`, we know it should take precedence. However, since `decimal_graph` borrows from `cardinal_graph`, its weight must still be less than `1.2`. 
As such: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-wF8cgLK9tpU" + }, + "outputs": [], + "source": [ + "classify = (\n", + " pynutil.add_weight(time_graph, 1)\n", + " | pynutil.add_weight(whitelist_graph, 1)\n", + " | pynutil.add_weight(decimal_graph, 1.1)\n", + " | pynutil.add_weight(cardinal_graph, 1.2)\n", + " | pynutil.add_weight(ordinal_graph, 1)\n", + " | pynutil.add_weight(money_graph, 1.09)\n", + " | pynutil.add_weight(word_graph, 100)\n", + " )\n", + "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1) + pynutil.insert(\" }\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "huMzDoZ2-FD2" + }, + "source": [ + "For those classes that don't seem affected, we can set their weights as the same as those below their 'foundation' graphs, simply to prevent prioritization when not required\n", + "\n", + "Meanwhile, `whitelist_graph` should take precedence over all others, as it may contain unique normalizations that may get accidentally caught by the other graphs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gWG6ttyd-bbD" + }, + "outputs": [], + "source": [ + "classify = (\n", + " pynutil.add_weight(time_graph, 1.1)\n", + " | pynutil.add_weight(whitelist_graph, 1.07)\n", + " | pynutil.add_weight(decimal_graph, 1.1)\n", + " | pynutil.add_weight(cardinal_graph, 1.2)\n", + " | pynutil.add_weight(ordinal_graph, 1.1)\n", + " | pynutil.add_weight(money_graph, 1.08)\n", + " | pynutil.add_weight(word_graph, 100)\n", + " )\n", + "punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(\" }\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1TH08f8O-fWx" + }, + "source": [ + "Keep in mind that building weights in this manner is hardly a rule for grammar development and is instead intended as a means to initialize weights for empirical development. You will find that actual strings will cause unexpected behavior that require fine tuning. \n", + "\n", + "For instance, the Classifier for French in NeMo ITN benefits from having varying precedence for some weights, as seen in the following excerpt:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gKdkyDK3_r46" + }, + "outputs": [], + "source": [ + "class ClassifyFst(GraphFst):\n", + " \"\"\"\n", + " Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased.\n", + " For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. \n", + " More details to deployment at NeMo/tools/text_processing_deployment.\n", + "\n", + " Args:\n", + " cache_dir: path to a dir with .far grammar file. 
Set to None to avoid using cache.\n", + " overwrite_cache: set to True to overwrite .far files\n", + " \"\"\"\n", + "\n", + " def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):\n", + " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")\n", + "\n", + " far_file = None\n", + " if cache_dir is not None and cache_dir != \"None\":\n", + " os.makedirs(cache_dir, exist_ok=True)\n", + " far_file = os.path.join(cache_dir, \"_fr_itn.far\")\n", + " if not overwrite_cache and far_file and os.path.exists(far_file):\n", + " self.fst = pynini.Far(far_file, mode=\"r\")[\"tokenize_and_classify\"]\n", + " logging.info(f\"ClassifyFst.fst was restored from {far_file}.\")\n", + " else:\n", + " logging.info(f\"Creating ClassifyFst grammars.\")\n", + "\n", + " cardinal = CardinalFst()\n", + " cardinal_graph = cardinal.fst\n", + "\n", + " fraction = FractionFst(cardinal)\n", + " fraction_graph = fraction.fst\n", + "\n", + " ordinal = OrdinalFst(cardinal)\n", + " ordinal_graph = ordinal.fst\n", + "\n", + " decimal = DecimalFst(cardinal)\n", + " decimal_graph = decimal.fst\n", + "\n", + " measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction).fst\n", + " date_graph = DateFst(cardinal).fst\n", + " word_graph = WordFst().fst\n", + " time_graph = TimeFst().fst\n", + " money_graph = MoneyFst(cardinal, decimal).fst\n", + " whitelist_graph = WhiteListFst().fst\n", + " punct_graph = PunctuationFst().fst\n", + " electronic_graph = ElectronicFst().fst\n", + " telephone_graph = TelephoneFst().fst\n", + "\n", + " classify = (\n", + " pynutil.add_weight(whitelist_graph, 1.01)\n", + " | pynutil.add_weight(time_graph, 1.05)\n", + " | pynutil.add_weight(date_graph, 1.09)\n", + " | pynutil.add_weight(decimal_graph, 1.08)\n", + " | pynutil.add_weight(measure_graph, 1.1)\n", + " | pynutil.add_weight(cardinal_graph, 1.1)\n", + " | pynutil.add_weight(ordinal_graph, 1.1)\n", + " | pynutil.add_weight(fraction_graph, 1.09)\n", + " | pynutil.add_weight(money_graph, 1.07)\n", + " | pynutil.add_weight(telephone_graph, 1.1)\n", + " | pynutil.add_weight(electronic_graph, 1.1)\n", + " | pynutil.add_weight(word_graph, 100)\n", + " )\n", + "\n", + " punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(\" }\")\n", + " token = pynutil.insert(\"tokens { \") + classify + pynutil.insert(\" }\")\n", + " token_plus_punct = (\n", + " pynini.closure(punct + pynutil.insert(\" \")) + token + pynini.closure(pynutil.insert(\" \") + punct)\n", + " )\n", + "\n", + " graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)\n", + " graph = delete_space + graph + delete_space\n", + "\n", + " self.fst = graph.optimize()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qc4B_0rNcQZu" + }, + "source": [ + "## FAR import/export" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0nRRPvy-AYsA" + }, + "source": [ + "While working through these code excerpts, you may have noticed some latency with each instantiation of our WFSTs (notably wherever `CardinalFst` was involved). This is because the `pynini.optimize` that we call with each graph's instantiation is computationally expensive. 
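You can see this for yourself with a rough timing check (illustrative only; the numbers depend heavily on your machine):

```Python
from time import perf_counter

start = perf_counter()
cardinal = CardinalFst()   # builds and optimizes the full cardinal grammar
print(f"CardinalFst built in {perf_counter() - start:.1f} s")
```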
For our ultimate purpose of deployment, it seems a waste of resources to recreate stable graphs for each use.\n", + "\n", + "To address this, NeMo ITN supports WFST caching through use of `pynini.Far`, storing and recovering Classify grammars as FAR (Fst ARchives).\n", + "\n", + "Let us update our `ClassifyFst` to permit passing a cache and allowing overwriting (for development):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5XgWevUzD1AE" + }, + "outputs": [], + "source": [ + "class ClassifyFst(GraphFst):\n", + " def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):\n", + " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l28GMR70ESz0" + }, + "source": [ + "For storing our graphs as FARs, we can use `graph_utils.generator_main`, which saves our WFSTs by type for easier management. For arguments it takes a string name and a dict mapping of WFST type to graph:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AzTkcmAWFLYm" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "class ClassifyFst(GraphFst):\n", + " def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):\n", + " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")\n", + " # Grammar here\n", + " # ....\n", + " if cache_dir is not None and cache_dir != \"None\":\n", + " os.makedirs(cache_dir, exist_ok=True)\n", + " far_file = os.path.join(cache_dir, \"_fr_itn.far\")\n", + " generator_main(far_file, {\"tokenize_and_classify\": self.fst})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Wz8wjCQSD6eJ" + }, + "source": [ + "We pair this with the ability to load from cache (note the `\"tokenize_and_classify\"` key being passed):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FRFYgMmuD_53" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "class ClassifyFst(GraphFst):\n", + " def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):\n", + " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")\n", + " if not overwrite_cache and far_file and os.path.exists(far_file):\n", + " self.fst = pynini.Far(far_file, mode=\"r\")[\"tokenize_and_classify\"]\n", + " else:\n", + " # Grammar here\n", + " # ....\n", + " if cache_dir is not None and cache_dir != \"None\":\n", + " os.makedirs(cache_dir, exist_ok=True)\n", + " far_file = os.path.join(cache_dir, \"_fr_itn.far\")\n", + " generator_main(far_file, {\"tokenize_and_classify\": self.fst})\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ib9nggZxF38s" + }, + "source": [ + "Producing our `ClassifyFst` as:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d2BZyx6sGGg2" + }, + "outputs": [], + "source": [ + "class ClassifyFst(GraphFst):\n", + " def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):\n", + " super().__init__(name=\"tokenize_and_classify\", kind=\"classify\")\n", + "\n", + " far_file = None\n", + " if cache_dir is not None and cache_dir != \"None\":\n", + " os.makedirs(cache_dir, exist_ok=True)\n", + " far_file = os.path.join(cache_dir, \"_fr_itn.far\")\n", + " if not overwrite_cache and far_file and os.path.exists(far_file):\n", + " self.fst = pynini.Far(far_file, mode=\"r\")[\"tokenize_and_classify\"]\n", + " else:\n", + " cardinal = CardinalFst()\n", + " cardinal_graph = 
cardinal.fst\n", + "\n", + " ordinal = OrdinalFst(cardinal)\n", + " ordinal_graph = ordinal.fst\n", + "\n", + " decimal = DecimalFst(cardinal)\n", + " decimal_graph = decimal.fst\n", + "\n", + " word_graph = WordFst().fst\n", + " time_graph = TimeFst().fst\n", + " money_graph = MoneyFst(cardinal, decimal).fst\n", + " whitelist_graph = WhiteListFst().fst\n", + " punct_graph = PunctuationFst().fst\n", + "\n", + " classify = (\n", + " pynutil.add_weight(time_graph, 1.1)\n", + " | pynutil.add_weight(whitelist_graph, 1.01)\n", + " | pynutil.add_weight(decimal_graph, 1.09)\n", + " | pynutil.add_weight(cardinal_graph, 1.1)\n", + " | pynutil.add_weight(ordinal_graph, 1.09)\n", + " | pynutil.add_weight(money_graph, 1.08)\n", + " | pynutil.add_weight(word_graph, 100)\n", + " )\n", + "\n", + " punct = pynutil.insert(\"tokens { \") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(\" }\")\n", + " token = pynutil.insert(\"tokens { \") + classify + pynutil.insert(\" }\")\n", + " token_plus_punct = (\n", + " pynini.closure(punct + pynutil.insert(\" \")) + token + pynini.closure(pynutil.insert(\" \") + punct)\n", + " )\n", + "\n", + " graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)\n", + " graph = delete_space + graph + delete_space\n", + "\n", + " self.fst = graph.optimize()\n", + "\n", + " if far_file:\n", + " generator_main(far_file, {\"tokenize_and_classify\": self.fst})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nEhY6wKKtfhn" + }, + "source": [ + "You should find that caching vastly speeds up compile time." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rTtCnC5w95CI" + }, + "source": [ + "# Verbalize and Verbalize Final " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "H9y5yuk1HaGj" + }, + "source": [ + "Our last step is to create a universal Verbalizer for all classes. This is very similar to the development of `ClassifyFst`, except that the Verbalizer breaks its normalization task into two components:\n", + "- `VerbalizeFst`, which removes formatting for each token\n", + "- `VerbalizeFinalFst`, which extends `VerbalizeFst` across all tokens in a string\n", + "\n", + "Why two components when `tokenize_and_classify` was one? Because Sparrowhawk performs all the functionality of `VerbalizeFinalFst`, so its inclusion would break deployment. However, without it, your NeMo grammar would be unable to function on its own. So we separate the two to allow the best of both worlds." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vUawTJVuH8iR" + }, + "source": [ + "## VerbalizeFst" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xghiBV06IIWU" + }, + "source": [ + "Much like `ClassifyFst`, `VerbalizeFst` instantiates all its subgraphs and then joins them together under a union operation. However, it does not need to employ weighting. Why? Because `ClassifyFst` has assigned each token a specific class. 
As each class is unique, there is no possibility that a subgraph will be employed for the wrong token.\n", + "\n", + "As such, our `VerbalizeFst` is formed by a simple union operation across all previous Verbalizer graphs:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uMVCqCvsIt2v" + }, + "outputs": [], + "source": [ + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.cardinal import CardinalFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.decimal import DecimalFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.money import MoneyFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.ordinal import OrdinalFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.time import TimeFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.whitelist import WhiteListFst\n", + "from nemo_text_processing.inverse_text_normalization.fr.verbalizers.word import WordFst\n", + "\n", + "class VerbalizeFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"verbalize\", kind=\"verbalize\")\n", + " cardinal = CardinalFst()\n", + " cardinal_graph = cardinal.fst\n", + " ordinal_graph = OrdinalFst().fst\n", + " decimal = DecimalFst()\n", + " decimal_graph = decimal.fst\n", + " whitelist_graph = WhiteListFst().fst\n", + " money_graph = MoneyFst(decimal=decimal).fst\n", + " time_graph = TimeFst().fst\n", + " graph = (\n", + " time_graph\n", + " | whitelist_graph\n", + " | money_graph\n", + " | ordinal_graph\n", + " | decimal_graph\n", + " | cardinal_graph\n", + " )\n", + " self.fst = graph" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Wap-LU6EI2Iu" + }, + "source": [ + "## Verbalize Final" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TYaEt_0tI47t" + }, + "source": [ + "With `VerbalizeFst` complete, we now extend our graph to cover any series of tokens. All this requires is deletion of the `tokens` formatting (note the absence of such in our previous graph) and use of closure for any series of one or more tokens.\n", + "\n", + "This provides the following graph:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "L-9lJNE6JPCW" + }, + "outputs": [], + "source": [ + "\n", + "class VerbalizeFinalFst(GraphFst):\n", + " def __init__(self):\n", + " super().__init__(name=\"verbalize_final\", kind=\"verbalize\")\n", + " verbalize = VerbalizeFst().fst\n", + " word = WordFst().fst\n", + " types = verbalize | word\n", + " graph = (\n", + " pynutil.delete(\"tokens\")\n", + " + delete_space\n", + " + pynutil.delete(\"{\")\n", + " + delete_space\n", + " + types\n", + " + delete_space\n", + " + pynutil.delete(\"}\")\n", + " )\n", + " graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space\n", + " self.fst = graph" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WwMKFw-QJVgm" + }, + "source": [ + "Unlike `ClassifyFst`, NeMo ITN does not cache `VerbalizeFst` or `VerbalizeFinalFst`. 
(While you are welcome to provide such functionality in your own development, keep in mind that the limited complexity of our Verbalizers makes compilingtimes less significant.)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7U21AZearZMK" + }, + "source": [ + "# Deployment " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VrSccoh9K6JK" + }, + "source": [ + "Now that we have done all the groundwork, we can finally move to deployment. This final section will just cover the minor code alterations required to call your language through NeMo ITN and deploy through Sparrowhawk. For further information on using NeMo ITN, please see [this tutorial](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Inverse_(Text)_Normalization.ipynb). " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0Le2aJvFIAKd" + }, + "source": [ + "## InverseNormalize" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "r2R3TUCDLi5-" + }, + "source": [ + "NeMo calls upon the `InverseNormalizer` class for all ITN tasks. Given a string and language, it will instantiate both the `ClassifierFst` and `VerbalizeFst` respective for the given language. (Note: we do not use `VerbalizeFinal` as its functions are managed by Sparrowhawk.) To make your language deployable in the general NeMo ITN system, you must designate the availability of these classes for instantiation. (For more information, see the [source code](https://github.com/NVIDIA/NeMo/blob/main/nemo_text_processing/inverse_text_normalization/inverse_normalize.py).)\n", + "\n", + "To do so requires only two changes. The first is providing a string to identify your language as an option for `parse_args` ([ISO codes are advised](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)):" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tfv4Ee3ML-Fg" + }, + "source": [ + "```Python\n", + "def parse_args():\n", + " parser = ArgumentParser()\n", + " ...\n", + " parser.add_argument(\"--language\", choices=[..., 'MY_LANGUAGE'], type=str)\n", + " ...\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "awVl5nAsMUTl" + }, + "source": [ + "The next is to call your `ClassifyFst` and `VerbalizeFst` from `__init__`:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```bash\n", + "class InverseNormalizer(Normalizer):\n", + " def __init__(self, lang: str = 'en', cache_dir: str = None, overwrite_cache: bool = False):\n", + "\n", + " if lang == 'en':\n", + " from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import ClassifyFst\n", + " from nemo_text_processing.inverse_text_normalization.en.verbalizers.verbalize_final import (\n", + " VerbalizeFinalFst,\n", + " )\n", + " # Other languages\n", + " # ....\n", + " elif lang == 'MY_LANGUAGE':\n", + "\n", + " from nemo_text_processing.inverse_text_normalization.MY_LANGUAGE.taggers.tokenize_and_classify import ClassifyFst\n", + "\n", + " from nemo_text_processing.inverse_text_normalization.MY_LANGUAGE.verbalizers.verbalize_final import (\n", + "\n", + " VerbalizeFst,\n", + "\n", + " )\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TI1PuejLMxdI" + }, + "source": [ + "And you're done! NeMo will handle the rest. 
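For instance, a minimal usage sketch (assuming your grammars were registered under the language code `'fr'`, as for the French grammars in this tutorial; the exact output depends on the grammars you built):

```Python
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer

inverse_normalizer = InverseNormalizer(lang='fr')
print(inverse_normalizer.inverse_normalize("quatre heures moins cinq", verbose=False))
# expected: 3 h 55
```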
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xrksINQoICfj" + }, + "source": [ + "## Grammar export and Deployment to C++" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rP9-dmMJSg3h" + }, + "source": [ + "Find information here:\n", + "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/wfst_text_processing_deployment.html" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TDoVUxCE-Dax" + }, + "source": [ + "# Final Notes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fw-9mU7ql8iY" + }, + "source": [ + "Congratulations, you have now constructed an entire ITN system from the ground up! While your experience will vary with each language, you will find several commonalities that will assist you in further development. \n", + "\n", + "If you are interested in working further with your language WFSTs, you may wish to construct a TN system. Broadly, this is accomplished by inverting your previous graphs (`pynini.invert` may assist here) and changing your outputs to avoid indeterminacy (i.e. decide on one canonical output for your grammar for each class). But outside of such grammar specific edits, you repeat many of the steps exhibited here, such as:\n", + "- Use of a two step classifier-verbalizer system\n", + "- Same semiotic classes for tagging\n", + "- Inheritance of `GraphFst`\n", + "\n", + "For Audio-based non-deterministic text normalization please extend your grammars with additional output options for ambiguous options. Every semiotic class has a input flag `deterministic` which is by default set to True. For non-deterministic text normalization add additional grammar for the case `deterministic=False` \n", + "\n", + "We also recommend to look at the source of some of the existing [languages](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/wfst_text_normalization.html#language-support), in particular English: https://github.com/NVIDIA/NeMo/tree/main/nemo_text_processing/inverse_text_normalization/en." + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "WFST Tutorial.ipynb", + "provenance": [], + "toc_visible": true + }, + "interpreter": { + "hash": "fbc643a332f9d7801191710b24a8a955d342df4f32791f7fb65121dc4784751f" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/tutorials/text_processing/images/audio_based_tn.png b/tutorials/text_processing/images/audio_based_tn.png new file mode 100644 index 000000000000..7953c7ee0eb0 Binary files /dev/null and b/tutorials/text_processing/images/audio_based_tn.png differ diff --git a/tutorials/text_processing/images/task_overview.png b/tutorials/text_processing/images/task_overview.png new file mode 100644 index 000000000000..98aefe0364fa Binary files /dev/null and b/tutorials/text_processing/images/task_overview.png differ