Merged
39 changes: 14 additions & 25 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -2,38 +2,27 @@

Add a one-line overview of what this PR aims to accomplish.

**Collection**: [Note which collection this PR will affect]

# Changelog
- Add specific line-by-line info of high-level changes in this PR.

# Usage
* Add a usage example below if applicable.

```python
# Add a code snippet demonstrating how to use this
```

# Before your PR is "Ready for review"
**Pre checks**:
- [ ] Make sure you have read and followed the [Contributor guidelines](https://github.com/NVIDIA/NeMo/blob/main/CONTRIBUTING.md)
- [ ] Did you write any new necessary tests?
- [ ] Did you add or update any necessary documentation?
- [ ] Does the PR affect components that are optional to install? (e.g., Numba, Pynini, Apex)
- [ ] Reviewer: Does the PR have correct import guards for all optional libraries?
- [ ] Have you signed your commits? Use ``git commit -s`` to sign.
- [ ] Do all unit tests pass before sending the PR?
1) ``pytest``, or ``pytest --cpu`` if your machine has no GPU (given you marked your test cases with `@pytest.mark.run_only_on('CPU')`), from the root folder.
2) Sparrowhawk tests ``bash tools/text_processing_deployment/export_grammars.sh --MODE=test ...``
- [ ] If you are adding a new feature: have you added test cases for both `pytest` and Sparrowhawk [here](tests/nemo_text_processing)?
- [ ] Have you added ``__init__.py`` for every folder and subfolder, including the `data` folder, which has .TSV files?
- [ ] Have you reviewed the CodeQL results and removed unused variables and imports (the report is at the bottom of the PR in the GitHub review box)?
- [ ] Have you added the correct license header `Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.` to all newly added Python files?
- [ ] If you copied [nemo_text_processing/text_normalization/en/graph_utils.py](nemo_text_processing/text_normalization/en/graph_utils.py) your header's second line should be `Copyright 2015 and onwards Google, Inc.`. See an example [here](https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/en/graph_utils.py#L2).
- [ ] Remove import guards (`try: import ... except: ...`) if not already done.
- [ ] If you added a new language or a new feature, please update the [NeMo documentation](https://github.com/NVIDIA/NeMo/blob/main/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst) (it lives in a different repo).
- [ ] Have you added your language support to [tools/text_processing_deployment/pynini_export.py](tools/text_processing_deployment/pynini_export.py)?



**PR Type**:
- [ ] New Feature
- [ ] Bugfix
- [ ] Documentation

If you haven't finished some of the above items, you can still open a "Draft" PR.


## Who can review?

Anyone in the community is free to review the PR once the checks have passed.
The [Contributor guidelines](https://github.com/NVIDIA/NeMo/blob/main/CONTRIBUTING.md) list specific people who can review PRs to various areas.

# Additional Information
* Related to # (issue)
2 changes: 0 additions & 2 deletions README.md
@@ -1,8 +1,6 @@
**NeMo Text Processing**
==========================

**This repository is under development; please refer to https://github.com/NVIDIA/NeMo/tree/main/nemo_text_processing for full functionality. See the [documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/wfst_text_normalization.html) for details.**

Introduction
------------

4 changes: 2 additions & 2 deletions nemo_text_processing/fst_alignment/alignment.cpp
@@ -30,7 +30,7 @@ typedef StdArcLookAheadFst LookaheadFst;
// Usage:

// g++ -std=gnu++11 -I<path to env>/include/ alignment.cpp -lfst -lthrax -ldl -L<path to env>/lib
// ./a.out <fst file> "tokenize_and_classify" "2615 Forest Av, 1 Aug 2016" 22 26
// ./a.out <fst file> "TOKENIZE_AND_CLASSIFY" "2615 Forest Av, 1 Aug 2016" 22 26

// Output:
// inp string: |2615 Forest Av, 1 Aug 2016|
@@ -42,7 +42,7 @@ typedef StdArcLookAheadFst LookaheadFst;
// Disclaimer: the heuristic algorithm relies on monotonic alignment and can fail in certain situations,
// e.g. when word pieces are reordered by the FST:

// ./a.out <fst file> "tokenize_and_classify" "$1" 0 1
// ./a.out <fst file> "TOKENIZE_AND_CLASSIFY" "$1" 0 1
// inp string: |$1|
// out string: |one dollar|
// inp indices: [0:1] out indices: [0:3]
45 changes: 28 additions & 17 deletions nemo_text_processing/fst_alignment/alignment.py
@@ -30,7 +30,7 @@

Usage:

python alignment.py --fst=<fst file> --text=\"2615 Forest Av, 1 Aug 2016\" --rule=\"tokenize_and_classify\" --start=22 --end=26
python alignment.py --fst=<fst file> --text="2615 Forest Av, 1 Aug 2016" --rule=TOKENIZE_AND_CLASSIFY --start=22 --end=26 --grammar=TN

Output:
inp string: |2615 Forest Av, 1 Aug 2016|
@@ -40,7 +40,7 @@
in: |2016| out: |twenty sixteen|


python alignment.py --fst=<fst file> --text=\"2615 Forest Av, 1 Aug 2016\" --rule=\"tokenize_and_classify\"
python alignment.py --fst=<fst file> --text="2615 Forest Av, 1 Aug 2016" --rule=TOKENIZE_AND_CLASSIFY

Output:
inp string: |2615 Forest Av, 1 Aug 2016|
@@ -74,6 +74,9 @@
def parse_args():
args = ArgumentParser("map substring to output with FST")
args.add_argument("--fst", help="FAR file containing FST", type=str, required=True)
args.add_argument(
"--grammar", help="tn or itn", type=str, required=False, choices=[ITN_MODE, TN_MODE], default=TN_MODE
)
args.add_argument(
"--rule",
help="rule name in FAR file containing FST",
@@ -94,6 +97,8 @@ def parse_args():

EPS = "<eps>"
WHITE_SPACE = "\u23B5"
ITN_MODE = "itn"
TN_MODE = "tn"


def get_word_segments(text: str) -> List[List[int]]:
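The body of `get_word_segments` is elided in this diff. Based on its signature and on the whole-string mode shown in the usage above, a minimal sketch (an assumption, not the repository's actual implementation) could look like:

```python
from typing import List


def get_word_segments(text: str) -> List[List[int]]:
    """Return [start, end) index pairs for each whitespace-separated word."""
    segments = []
    start = None
    for i, ch in enumerate(text):
        if ch.isspace():
            if start is not None:
                segments.append([start, i])
                start = None
        elif start is None:
            start = i
    if start is not None:
        segments.append([start, len(text)])
    return segments


print(get_word_segments("2615 Forest Av"))  # [[0, 4], [5, 11], [12, 14]]
```

These per-word index pairs are what the script maps through the alignment when no `--start`/`--end` range is given.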
@@ -142,9 +147,10 @@ def get_string_alignment(fst: pynini.Fst, input_text: str, symbol_table: pynini.

ilabels = paths.ilabels()
olabels = paths.olabels()
logging.debug(paths.istring())
logging.debug(paths.ostring())
logging.debug("input: " + paths.istring())
logging.debug("output: " + paths.ostring())
output = list(zip([symbol_table.find(x) for x in ilabels], [symbol_table.find(x) for x in olabels]))
logging.debug(f"alignment: {output}")
paths.next()
assert paths.done()
output_str = "".join(map(remove, [x[1] for x in output]))
@@ -184,52 +190,57 @@ def _get_original_index(alignment, aligned_index):
remove = lambda x: "" if x == EPS else " " if x == WHITE_SPACE else x
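The `remove` lambda maps the eps symbol to the empty string and the visible-space placeholder back to a real space when output labels are joined into the output string. A small illustration (the alignment pairs here are made up for demonstration):

```python
EPS = "<eps>"
WHITE_SPACE = "\u23b5"  # visible-space placeholder used in the symbol table

remove = lambda x: "" if x == EPS else " " if x == WHITE_SPACE else x

# (input_label, output_label) pairs; eps on the input side marks insertions.
alignment = [("2", "t"), (EPS, "w"), (EPS, "o"), (WHITE_SPACE, WHITE_SPACE)]
output_str = "".join(remove(o) for _, o in alignment)
print(repr(output_str))  # 'two '
```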


def indexed_map_to_output(alignment: List[tuple], start: int, end: int):
def indexed_map_to_output(alignment: List[tuple], start: int, end: int, mode: str):
"""
Given the inclusive start and exclusive end index of an input substring, return the corresponding start and end indices in the output string

Args:
alignment: alignment generated by the FST shortest path; it is longer than the original string since it includes eps transitions
start: inclusive start position in input string
end: exclusive end position in input string
mode: grammar type, either tn or itn

Returns:
output_og_start_index: inclusive start position in output string
output_og_end_index: exclusive end position in output string
"""
# get aligned start and end of input substring

aligned_start = _get_aligned_index(alignment, start)
aligned_end = _get_aligned_index(alignment, end - 1) # inclusive

logging.debug(f"0: |{list(map(remove, [x[0] for x in alignment[aligned_start:aligned_end+1]]))}|")

# extend aligned_start to left

while (
aligned_start - 1 > 0
and alignment[aligned_start - 1][0] == EPS
and (alignment[aligned_start - 1][1].isalpha() or alignment[aligned_start - 1][1] == EPS)
and (alignment[aligned_start - 1][1].isalnum() or alignment[aligned_start - 1][1] == EPS)
):
aligned_start -= 1

while (
aligned_end + 1 < len(alignment)
and alignment[aligned_end + 1][0] == EPS
and (alignment[aligned_end + 1][1].isalpha() or alignment[aligned_end + 1][1] == EPS)
and (alignment[aligned_end + 1][1].isalnum() or alignment[aligned_end + 1][1] == EPS)
):
aligned_end += 1

while (aligned_end + 1) < len(alignment) and (
alignment[aligned_end + 1][1].isalpha() or alignment[aligned_end + 1][1] == EPS
):
aligned_end += 1
if mode == TN_MODE:
while (aligned_end + 1) < len(alignment) and (
alignment[aligned_end + 1][1].isalnum() or alignment[aligned_end + 1][1] == EPS
):
aligned_end += 1

output_og_start_index = _get_original_index(alignment=alignment, aligned_index=aligned_start)
output_og_end_index = _get_original_index(alignment=alignment, aligned_index=aligned_end + 1)

return output_og_start_index, output_og_end_index


if __name__ == '__main__':
logging.setLevel(logging.INFO)
logging.getLogger().setLevel(logging.INFO)
args = parse_args()
fst = Far(args.fst, mode='r')
try:
@@ -240,14 +251,14 @@ def indexed_map_to_output(alignment: List[tuple], start: int, end: int):

table = create_symbol_table()
alignment, output_text = get_string_alignment(fst=fst, input_text=input_text, symbol_table=table)
print(f"inp string: |{args.text}|")
print(f"out string: |{output_text}|")
logging.info(f"inp string: |{args.text}|")
logging.info(f"out string: |{output_text}|")

if args.start is None:
indices = get_word_segments(input_text)
else:
indices = [(args.start, args.end)]
for x in indices:
start, end = indexed_map_to_output(start=x[0], end=x[1], alignment=alignment)
print(f"inp indices: [{x[0]}:{x[1]}] out indices: [{start}:{end}]")
print(f"in: |{input_text[x[0]:x[1]]}| out: |{output_text[start:end]}|")
start, end = indexed_map_to_output(start=x[0], end=x[1], alignment=alignment, mode=args.grammar)
logging.info(f"inp indices: [{x[0]}:{x[1]}] out indices: [{start}:{end}]")
logging.info(f"in: |{input_text[x[0]:x[1]]}| out: |{output_text[start:end]}|")