From 92ef96b103d6f079b7bee04633d3feb7295416be Mon Sep 17 00:00:00 2001
From: Yang Zhang
Date: Wed, 22 Feb 2023 11:42:42 -0800
Subject: [PATCH 1/3] save

Signed-off-by: Yang Zhang
---
 README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/README.md b/README.md
index 836600608..03020d557 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,6 @@
 **NeMo Text Processing**
 ==========================
 
-**This repository is under development, please refer to https://github.com/NVIDIA/NeMo/tree/main/nemo_text_processing for full functionality. See [documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/wfst_text_normalization.html) for details.**
-
 Introduction
 ------------

From 41a451e9274ef8b31fbdde136e284af58943a434 Mon Sep 17 00:00:00 2001
From: Yang Zhang
Date: Wed, 22 Feb 2023 11:56:30 -0800
Subject: [PATCH 2/3] save

Signed-off-by: Yang Zhang
---
 .github/PULL_REQUEST_TEMPLATE.md | 39 ++++++++++++--------------------
 1 file changed, 14 insertions(+), 25 deletions(-)

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 6858131a8..f7370ab82 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -2,38 +2,27 @@
 
 Add a one line overview of what this PR aims to accomplish.
 
-**Collection**: [Note which collection this PR will affect]
-
-# Changelog
-- Add specific line by line info of high level changes in this PR.
-
-# Usage
-* You can potentially add a usage example below
-
-```python
-# Add a code snippet demonstrating how to use this
-```
 
 # Before your PR is "Ready for review"
 
 **Pre checks**:
-- [ ] Make sure you read and followed [Contributor guidelines](https://github.com/NVIDIA/NeMo/blob/main/CONTRIBUTING.md)
-- [ ] Did you write any new necessary tests?
-- [ ] Did you add or update any necessary documentation?
-- [ ] Does the PR affect components that are optional to install? (Ex: Numba, Pynini, Apex etc)
-  - [ ] Reviewer: Does the PR have correct import guards for all optional libraries?
+- [ ] Have you signed your commits? Use ``git commit -s`` to sign.
+- [ ] Do all unit tests finish successfully before sending the PR?
+  1) ``pytest`` or (if your machine does not have a GPU) ``pytest --cpu`` from the root folder (given you marked your test cases accordingly with `@pytest.mark.run_only_on('CPU')`).
+  2) Sparrowhawk tests: ``bash tools/text_processing_deployment/export_grammars.sh --MODE=test ...``
+- [ ] If you are adding a new feature: have you added test cases for both `pytest` and Sparrowhawk [here](tests/nemo_text_processing)?
+- [ ] Have you added ``__init__.py`` to every folder and subfolder, including the `data` folder which holds the .tsv files?
+- [ ] Have you followed the CodeQL results and removed unused variables and imports (the report is at the bottom of the PR in the GitHub review box)?
+- [ ] Have you added the correct license header `Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.` to all newly added Python files?
+- [ ] If you copied [nemo_text_processing/text_normalization/en/graph_utils.py](nemo_text_processing/text_normalization/en/graph_utils.py), your header's second line should be `Copyright 2015 and onwards Google, Inc.`. See an example [here](https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/en/graph_utils.py#L2).
+- [ ] Remove import guards (`try: import ... except ImportError: ...`) if not already done.
+- [ ] If you added a new language or a new feature, please update the [NeMo documentation](https://github.com/NVIDIA/NeMo/blob/main/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst) (it lives in a different repo).
+- [ ] Have you added your language support to [tools/text_processing_deployment/pynini_export.py](tools/text_processing_deployment/pynini_export.py)?
+
+
 **PR Type**:
 - [ ] New Feature
 - [ ] Bugfix
 - [ ] Documentation
 
 If you haven't finished some of the above items you can still open "Draft" PR.
-
-
-## Who can review?
-
-Anyone in the community is free to review the PR once the checks have passed.
-[Contributor guidelines](https://github.com/NVIDIA/NeMo/blob/main/CONTRIBUTING.md) contains specific people who can review PRs to various areas.
-
-# Additional Information
-* Related to # (issue)

From 523cb6077c78b227b4e5ffa06bc36505bf751f44 Mon Sep 17 00:00:00 2001
From: Yang Zhang
Date: Wed, 22 Feb 2023 12:54:25 -0800
Subject: [PATCH 3/3] extend alignment for itn

Signed-off-by: Yang Zhang
---
 .../fst_alignment/alignment.cpp |  4 +-
 .../fst_alignment/alignment.py  | 45 ++++++++++++-------
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/nemo_text_processing/fst_alignment/alignment.cpp b/nemo_text_processing/fst_alignment/alignment.cpp
index aba2c3bab..4395a82fd 100644
--- a/nemo_text_processing/fst_alignment/alignment.cpp
+++ b/nemo_text_processing/fst_alignment/alignment.cpp
@@ -30,7 +30,7 @@ typedef StdArcLookAheadFst LookaheadFst;
 
 // Usage:
 // g++ -std=gnu++11 -I/include/ alignment.cpp -lfst -lthrax -ldl -L/lib
-// ./a.out "tokenize_and_classify" "2615 Forest Av, 1 Aug 2016" 22 26
+// ./a.out "TOKENIZE_AND_CLASSIFY" "2615 Forest Av, 1 Aug 2016" 22 26
 
 // Output:
 // inp string: |2615 Forest Av, 1 Aug 2016|
@@ -42,7 +42,7 @@ typedef StdArcLookAheadFst LookaheadFst;
 // Disclaimer: The heuristic algorithm relies on monotonous alignment and can fail in certain situations,
 // e.g. when word pieces are reordered by the fst, e.g.
-// ./a.out "tokenize_and_classify" "$1" 0 1
+// ./a.out "TOKENIZE_AND_CLASSIFY" "$1" 0 1
 // inp string: |$1|
 // out string: |one dollar|
 // inp indices: [0:1] out indices: [0:3]

diff --git a/nemo_text_processing/fst_alignment/alignment.py b/nemo_text_processing/fst_alignment/alignment.py
index 65727af1b..b143ade7c 100644
--- a/nemo_text_processing/fst_alignment/alignment.py
+++ b/nemo_text_processing/fst_alignment/alignment.py
@@ -30,7 +30,7 @@
 
 Usage:
 
-python alignment.py --fst= --text=\"2615 Forest Av, 1 Aug 2016\" --rule=\"tokenize_and_classify\" --start=22 --end=26
+python alignment.py --fst= --text="2615 Forest Av, 1 Aug 2016" --rule=TOKENIZE_AND_CLASSIFY --start=22 --end=26 --grammar=tn
 
 Output:
     inp string: |2615 Forest Av, 1 Aug 2016|
 
     in: |2016| out: |twenty sixteen|
 
-python alignment.py --fst= --text=\"2615 Forest Av, 1 Aug 2016\" --rule=\"tokenize_and_classify\"
+python alignment.py --fst= --text="2615 Forest Av, 1 Aug 2016" --rule=TOKENIZE_AND_CLASSIFY
 
 Output:
     inp string: |2615 Forest Av, 1 Aug 2016|
@@ -74,6 +74,9 @@ def parse_args():
     args = ArgumentParser("map substring to output with FST")
     args.add_argument("--fst", help="FAR file containing FST", type=str, required=True)
+    args.add_argument(
+        "--grammar", help="tn or itn", type=str, required=False, choices=[ITN_MODE, TN_MODE], default=TN_MODE
+    )
     args.add_argument(
         "--rule",
         help="rule name in FAR file containing FST",
@@ -94,6 +97,8 @@
 
 EPS = "<eps>"
 WHITE_SPACE = "\u23B5"
+ITN_MODE = "itn"
+TN_MODE = "tn"
 
 
 def get_word_segments(text: str) -> List[List[int]]:
@@ -142,9 +147,10 @@ def get_string_alignment(fst: pynini.Fst, input_text: str, symbol_table: pynini.
     ilabels = paths.ilabels()
     olabels = paths.olabels()
-    logging.debug(paths.istring())
-    logging.debug(paths.ostring())
+    logging.debug("input: " + paths.istring())
+    logging.debug("output: " + paths.ostring())
     output = list(zip([symbol_table.find(x) for x in ilabels], [symbol_table.find(x) for x in olabels]))
+    logging.debug(f"alignment: {output}")
     paths.next()
     assert paths.done()
     output_str = "".join(map(remove, [x[1] for x in output]))
@@ -184,7 +190,7 @@ def _get_original_index(alignment, aligned_index):
 remove = lambda x: "" if x == EPS else " " if x == WHITE_SPACE else x
 
 
-def indexed_map_to_output(alignment: List[tuple], start: int, end: int):
+def indexed_map_to_output(alignment: List[tuple], start: int, end: int, mode: str):
     """
     Given input start and end index of contracted substring return corresponding output start and end index
@@ -192,44 +198,49 @@
         alignment: alignment generated by FST with shortestpath, is longer than original string since including eps transitions
         start: inclusive start position in input string
         end: exclusive end position in input string
+        mode: grammar type, either tn or itn
 
     Returns:
         output_og_start_index: inclusive start position in output string
         output_og_end_index: exclusive end position in output string
     """
     # get aligned start and end of input substring
+
     aligned_start = _get_aligned_index(alignment, start)
     aligned_end = _get_aligned_index(alignment, end - 1)  # inclusive
 
     logging.debug(f"0: |{list(map(remove, [x[0] for x in alignment[aligned_start:aligned_end+1]]))}|")
 
     # extend aligned_start to left
+
     while (
         aligned_start - 1 > 0
         and alignment[aligned_start - 1][0] == EPS
-        and (alignment[aligned_start - 1][1].isalpha() or alignment[aligned_start - 1][1] == EPS)
+        and (alignment[aligned_start - 1][1].isalnum() or alignment[aligned_start - 1][1] == EPS)
     ):
         aligned_start -= 1
     while (
         aligned_end + 1 < len(alignment)
         and alignment[aligned_end + 1][0] == EPS
-        and (alignment[aligned_end + 1][1].isalpha() or alignment[aligned_end + 1][1] == EPS)
+        and (alignment[aligned_end + 1][1].isalnum() or alignment[aligned_end + 1][1] == EPS)
     ):
         aligned_end += 1
 
-    while (aligned_end + 1) < len(alignment) and (
-        alignment[aligned_end + 1][1].isalpha() or alignment[aligned_end + 1][1] == EPS
-    ):
-        aligned_end += 1
+    if mode == TN_MODE:
+        while (aligned_end + 1) < len(alignment) and (
+            alignment[aligned_end + 1][1].isalnum() or alignment[aligned_end + 1][1] == EPS
+        ):
+            aligned_end += 1
 
     output_og_start_index = _get_original_index(alignment=alignment, aligned_index=aligned_start)
     output_og_end_index = _get_original_index(alignment=alignment, aligned_index=aligned_end + 1)
+
     return output_og_start_index, output_og_end_index
 
 
 if __name__ == '__main__':
-    logging.setLevel(logging.INFO)
+    logging.getLogger().setLevel(logging.INFO)
     args = parse_args()
     fst = Far(args.fst, mode='r')
     try:
@@ -240,14 +251,14 @@ def indexed_map_to_output(alignment: List[tuple], start: int, end: int):
     table = create_symbol_table()
     alignment, output_text = get_string_alignment(fst=fst, input_text=input_text, symbol_table=table)
-    print(f"inp string: |{args.text}|")
-    print(f"out string: |{output_text}|")
+    logging.info(f"inp string: |{args.text}|")
+    logging.info(f"out string: |{output_text}|")
     if args.start is None:
         indices = get_word_segments(input_text)
     else:
         indices = [(args.start, args.end)]
     for x in indices:
-        start, end = indexed_map_to_output(start=x[0], end=x[1], alignment=alignment)
-        print(f"inp indices: [{x[0]}:{x[1]}] out indices: [{start}:{end}]")
-        print(f"in: |{input_text[x[0]:x[1]]}| out: |{output_text[start:end]}|")
+        start, end = indexed_map_to_output(start=x[0], end=x[1], alignment=alignment, mode=args.grammar)
+        logging.info(f"inp indices: [{x[0]}:{x[1]}] out indices: [{start}:{end}]")
+        logging.info(f"in: |{input_text[x[0]:x[1]]}| out: |{output_text[start:end]}|")
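
For reviewers of PATCH 3/3: why the final right-extension loop is now gated on `mode == TN_MODE` is easiest to see on a contracted (ITN) alignment. The sketch below is a hand-made toy, not the output of a real grammar: the tuples only mimic the (input symbol, output symbol) pairs that `get_string_alignment()` derives from a pynini shortest path, and the character-level mapping of "twenty sixteen" onto "2016" is an assumption for illustration.

```python
# Toy illustration of the mode-dependent right extension in
# indexed_map_to_output(). The alignment pairs are hand-constructed
# assumptions, not produced by an actual FST.
EPS = "<eps>"

# ITN-style alignment: input "twenty sixteen" -> output "2016".
itn_alignment = [
    ("t", "2"), ("w", EPS), ("e", EPS), ("n", EPS), ("t", EPS), ("y", EPS),
    (" ", EPS),
    ("s", "0"), ("i", "1"), ("x", "6"),
    ("t", EPS), ("e", EPS), ("e", EPS), ("n", EPS),
]

def extend_end(alignment, aligned_end, mode):
    """Mirrors the patched right-extension loops of indexed_map_to_output()."""
    # Both modes absorb inserted output (epsilon input) that is alphanumeric.
    while (
        aligned_end + 1 < len(alignment)
        and alignment[aligned_end + 1][0] == EPS
        and (alignment[aligned_end + 1][1].isalnum() or alignment[aligned_end + 1][1] == EPS)
    ):
        aligned_end += 1
    # Only TN keeps walking through alphanumeric output that consumes further
    # input characters: one TN token can expand into several output words.
    if mode == "tn":
        while (aligned_end + 1) < len(alignment) and (
            alignment[aligned_end + 1][1].isalnum() or alignment[aligned_end + 1][1] == EPS
        ):
            aligned_end += 1
    return aligned_end

def emitted(alignment, lo, hi):
    # Output characters produced on the aligned range [lo, hi], epsilons dropped.
    return "".join(out for _, out in alignment[lo : hi + 1] if out != EPS)

# The input word "twenty" covers aligned indices 0..5.
print(emitted(itn_alignment, 0, extend_end(itn_alignment, 5, "itn")))  # prints "2"
print(emitted(itn_alignment, 0, extend_end(itn_alignment, 5, "tn")))   # prints "2016"
```

In TN mode the greedy walk is what lets a single input token such as "2016" claim its full multi-word verbalization; in ITN mode the output is shorter than the input, so the same walk would swallow the digits that belong to the following input words, hence the new `if mode == TN_MODE:` guard.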