From 381b96e619ae80dd319abe5af8d6b67e3aa075e1 Mon Sep 17 00:00:00 2001 From: Evelina Date: Thu, 20 Jul 2023 21:25:09 -0700 Subject: [PATCH 1/7] fix for empty pred_text Signed-off-by: Evelina --- .../normalize_with_audio.py | 3 +- .../text_normalization/utils_audio_based.py | 84 +++++++++++-------- 2 files changed, 49 insertions(+), 38 deletions(-) diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index f232c04df..a69493bbd 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -141,7 +141,7 @@ def normalize( Returns: normalized text options (usually there are multiple ways of normalizing a given semiotic class) """ - if pred_text is None or self.tagger is None: + if pred_text is None or pred_text == "" or self.tagger is None: return self.normalize_non_deterministic( text=text, n_tagged=n_tagged, punct_post_process=punct_post_process, verbose=verbose ) @@ -156,6 +156,7 @@ def normalize( semiotic_spans, pred_text_spans, norm_spans, text_with_span_tags_list, masked_idx_list = get_alignment( text, det_norm, pred_text, verbose=False ) + sem_tag_idx = 0 for cur_semiotic_span, cur_pred_text, cur_deter_norm in zip(semiotic_spans, pred_text_spans, norm_spans): if len(cur_semiotic_span) == 0: diff --git a/nemo_text_processing/text_normalization/utils_audio_based.py b/nemo_text_processing/text_normalization/utils_audio_based.py index d9fc5ed19..5f3ace8cd 100644 --- a/nemo_text_processing/text_normalization/utils_audio_based.py +++ b/nemo_text_processing/text_normalization/utils_audio_based.py @@ -24,8 +24,7 @@ def _get_alignment(a: str, b: str) -> Dict: """ - - Construscts alignment between a and b + Constructs alignment between a and b Returns: a dictionary, where keys are a's word index and values is a Tuple that contains span from b, and whether it @@ -62,7 +61,7 @@ def _get_alignment(a: 
str, b: str) -> Dict: def adjust_boundaries(norm_raw_diffs: Dict, norm_pred_diffs: Dict, raw: str, norm: str, pred_text: str, verbose=False): """ - Adjust alignement boundaries by taking norm--raw texts and norm--pred_text alignements, and creating raw-pred_text + Adjust alignment boundaries by taking norm--raw texts and norm--pred_text alignments, and creating raw-pred_text alignment. norm_raw_diffs: output of _get_alignment(norm, raw) @@ -92,10 +91,12 @@ def adjust_boundaries(norm_raw_diffs: Dict, norm_pred_diffs: Dict, raw: str, nor raw_text_mask_idx: [1, 4] """ - adjusted = [] + raw_pred_spans = [] word_id = 0 while word_id < len(norm.split()): norm_raw, norm_pred = norm_raw_diffs[word_id], norm_pred_diffs[word_id] + # if there is a mismatch in norm_raw and norm_pred, expand the boundaries of the shortest mismatch to align with the longest one + # e.g., norm_raw = (1, 2, 'match') norm_pred = (1, 5, 'non-match') => expand norm_raw until the next matching sequence or the end of string to align with norm_pred if (norm_raw[2] == MATCH and norm_pred[2] == NONMATCH) or (norm_raw[2] == NONMATCH and norm_pred[2] == MATCH): mismatched_id = word_id non_match_raw_start = norm_raw[0] @@ -114,7 +115,7 @@ def adjust_boundaries(norm_raw_diffs: Dict, norm_pred_diffs: Dict, raw: str, nor if not done: non_match_raw_end = len(raw.split()) non_match_pred_end = len(pred_text.split()) - adjusted.append( + raw_pred_spans.append( ( mismatched_id, (non_match_raw_start, non_match_raw_end, NONMATCH), @@ -122,12 +123,13 @@ def adjust_boundaries(norm_raw_diffs: Dict, norm_pred_diffs: Dict, raw: str, nor ) ) else: - adjusted.append((word_id, norm_raw, norm_pred)) + raw_pred_spans.append((word_id, norm_raw, norm_pred)) word_id += 1 - adjusted2 = [] + # aggregate neighboring spans with the same status + spans_merged_neighbors = [] last_status = None - for idx, item in enumerate(adjusted): + for idx, item in enumerate(raw_pred_spans): if last_status is None: last_status = item[1][2] 
raw_start = item[1][0] @@ -139,7 +141,7 @@ def adjust_boundaries(norm_raw_diffs: Dict, norm_pred_diffs: Dict, raw: str, nor raw_end = item[1][1] pred_text_end = item[2][1] else: - adjusted2.append( + spans_merged_neighbors.append( [[norm_span_start, item[0]], [raw_start, raw_end], [pred_text_start, pred_text_end], last_status] ) last_status = item[1][2] @@ -152,13 +154,13 @@ def adjust_boundaries(norm_raw_diffs: Dict, norm_pred_diffs: Dict, raw: str, nor if last_status == item[1][2]: raw_end = item[1][1] pred_text_end = item[2][1] - adjusted2.append( + spans_merged_neighbors.append( [[norm_span_start, item[0]], [raw_start, raw_end], [pred_text_start, pred_text_end], last_status] ) else: - adjusted2.append( + spans_merged_neighbors.append( [ - [adjusted[idx - 1][0], len(norm.split())], + [raw_pred_spans[idx - 1][0], len(norm.split())], [item[1][0], len(raw.split())], [item[2][0], len(pred_text.split())], item[1][2], @@ -171,10 +173,10 @@ def adjust_boundaries(norm_raw_diffs: Dict, norm_pred_diffs: Dict, raw: str, nor # increase boundaries between raw and pred_text if some spans contain empty pred_text extended_spans = [] - adjusted3 = [] + raw_norm_spans_corrected_for_pred_text = [] idx = 0 - while idx < len(adjusted2): - item = adjusted2[idx] + while idx < len(spans_merged_neighbors): + item = spans_merged_neighbors[idx] cur_semiotic = " ".join(raw_list[item[1][0] : item[1][1]]) cur_pred_text = " ".join(pred_text_list[item[2][0] : item[2][1]]) @@ -186,8 +188,8 @@ def adjust_boundaries(norm_raw_diffs: Dict, norm_pred_diffs: Dict, raw: str, nor # if cur_pred_text is an empty string if item[2][0] == item[2][1]: # for the last item - if idx == len(adjusted2) - 1 and len(adjusted3) > 0: - last_item = adjusted3[-1] + if idx == len(spans_merged_neighbors) - 1 and len(raw_norm_spans_corrected_for_pred_text) > 0: + last_item = raw_norm_spans_corrected_for_pred_text[-1] last_item[0][1] = item[0][1] last_item[1][1] = item[1][1] last_item[2][1] = item[2][1] @@ -196,29 +198,29 
@@ def adjust_boundaries(norm_raw_diffs: Dict, norm_pred_diffs: Dict, raw: str, nor raw_start, raw_end = item[0] norm_start, norm_end = item[1] pred_start, pred_end = item[2] - while idx < len(adjusted2) - 1 and not ((pred_end - pred_start) > 2 and adjusted2[idx][-1] == MATCH): + while idx < len(spans_merged_neighbors) - 1 and not ((pred_end - pred_start) > 2 and spans_merged_neighbors[idx][-1] == MATCH): idx += 1 - raw_end = adjusted2[idx][0][1] - norm_end = adjusted2[idx][1][1] - pred_end = adjusted2[idx][2][1] + raw_end = spans_merged_neighbors[idx][0][1] + norm_end = spans_merged_neighbors[idx][1][1] + pred_end = spans_merged_neighbors[idx][2][1] cur_item = [[raw_start, raw_end], [norm_start, norm_end], [pred_start, pred_end], NONMATCH] - adjusted3.append(cur_item) - extended_spans.append(len(adjusted3) - 1) + raw_norm_spans_corrected_for_pred_text.append(cur_item) + extended_spans.append(len(raw_norm_spans_corrected_for_pred_text) - 1) idx += 1 else: - adjusted3.append(item) + raw_norm_spans_corrected_for_pred_text.append(item) idx += 1 semiotic_spans = [] norm_spans = [] pred_texts = [] raw_text_masked = "" - for idx, item in enumerate(adjusted3): + for idx, item in enumerate(raw_norm_spans_corrected_for_pred_text): cur_semiotic = " ".join(raw_list[item[1][0] : item[1][1]]) cur_pred_text = " ".join(pred_text_list[item[2][0] : item[2][1]]) cur_norm_span = " ".join(norm_list[item[0][0] : item[0][1]]) - if idx == len(adjusted3) - 1: + if idx == len(raw_norm_spans_corrected_for_pred_text) - 1: cur_norm_span = " ".join(norm_list[item[0][0] : len(norm_list)]) if (item[-1] == NONMATCH and cur_semiotic != cur_norm_span) or (idx in extended_spans): raw_text_masked += " " + SEMIOTIC_TAG @@ -233,27 +235,34 @@ def adjust_boundaries(norm_raw_diffs: Dict, norm_pred_diffs: Dict, raw: str, nor if verbose: print("+" * 50) - print("adjusted:") - for item in adjusted2: + print("raw_pred_spans:") + for item in spans_merged_neighbors: print(f"{raw.split()[item[1][0]: item[1][1]]} 
-- {pred_text.split()[item[2][0]: item[2][1]]}") print("+" * 50) - print("adjusted2:") - for item in adjusted2: + print("spans_merged_neighbors:") + for item in spans_merged_neighbors: print(f"{raw.split()[item[1][0]: item[1][1]]} -- {pred_text.split()[item[2][0]: item[2][1]]}") print("+" * 50) - print("adjusted3:") - for item in adjusted3: + print("raw_norm_spans_corrected_for_pred_text:") + for item in raw_norm_spans_corrected_for_pred_text: print(f"{raw.split()[item[1][0]: item[1][1]]} -- {pred_text.split()[item[2][0]: item[2][1]]}") print("+" * 50) return semiotic_spans, pred_texts, norm_spans, raw_text_masked_list, raw_text_mask_idx -def get_alignment(raw, norm, pred_text, verbose: bool = False): +def get_alignment(raw: str, norm: str, pred_text: str, verbose: bool = False): + """ + Aligns raw text with deterministically normalized text and ASR output, finds semiotic spans + """ + for value in [raw, norm, pred_text]: + if value is None or value == "": + return [], [], [], [], [] + norm_pred_diffs = _get_alignment(norm, pred_text) norm_raw_diffs = _get_alignment(norm, raw) - + semiotic_spans, pred_texts, norm_spans, raw_text_masked_list, raw_text_mask_idx = adjust_boundaries( norm_raw_diffs, norm_pred_diffs, raw, norm, pred_text, verbose ) @@ -271,8 +280,9 @@ def get_alignment(raw, norm, pred_text, verbose: bool = False): if __name__ == "__main__": - raw = 'This is #4 ranking on G.S.K.T.' - pred_text = 'this iss for ranking on g k p' + raw = 'This is a #4 ranking on G.S.K.T.' 
+ pred_text = 'this iss p k for ranking on g k p' norm = 'This is nubmer four ranking on GSKT' - get_alignment(raw, norm, pred_text, True) + output = get_alignment(raw, norm, pred_text, True) + print(output) From ab7bf29ca3d91506326c884d385976f2b8f5a94c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Jul 2023 04:26:58 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/utils_audio_based.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/text_normalization/utils_audio_based.py b/nemo_text_processing/text_normalization/utils_audio_based.py index 5f3ace8cd..0f7e05865 100644 --- a/nemo_text_processing/text_normalization/utils_audio_based.py +++ b/nemo_text_processing/text_normalization/utils_audio_based.py @@ -198,7 +198,9 @@ def adjust_boundaries(norm_raw_diffs: Dict, norm_pred_diffs: Dict, raw: str, nor raw_start, raw_end = item[0] norm_start, norm_end = item[1] pred_start, pred_end = item[2] - while idx < len(spans_merged_neighbors) - 1 and not ((pred_end - pred_start) > 2 and spans_merged_neighbors[idx][-1] == MATCH): + while idx < len(spans_merged_neighbors) - 1 and not ( + (pred_end - pred_start) > 2 and spans_merged_neighbors[idx][-1] == MATCH + ): idx += 1 raw_end = spans_merged_neighbors[idx][0][1] norm_end = spans_merged_neighbors[idx][1][1] @@ -262,7 +264,7 @@ def get_alignment(raw: str, norm: str, pred_text: str, verbose: bool = False): norm_pred_diffs = _get_alignment(norm, pred_text) norm_raw_diffs = _get_alignment(norm, raw) - + semiotic_spans, pred_texts, norm_spans, raw_text_masked_list, raw_text_mask_idx = adjust_boundaries( norm_raw_diffs, norm_pred_diffs, raw, norm, pred_text, verbose ) From 2c50e4b38002e88eee22abc28b5d487f9a14757e Mon Sep 17 00:00:00 2001 From: Evelina Date: Thu, 20 Jul 2023 21:31:36 -0700 Subject: [PATCH 3/7] add 
unittests Signed-off-by: Evelina --- Jenkinsfile | 7 +++-- .../audio_based_utils/__init__.py | 13 ++++++++ .../test_audio_based_utils.py | 30 +++++++++++++++++++ 3 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 tests/nemo_text_processing/audio_based_utils/__init__.py create mode 100644 tests/nemo_text_processing/audio_based_utils/test_audio_based_utils.py diff --git a/Jenkinsfile b/Jenkinsfile index 2a52d09ee..5139511da 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -57,7 +57,6 @@ pipeline { } - stage('L0: Create EN TN/ITN Grammars') { when { anyOf { @@ -67,7 +66,11 @@ pipeline { } failFast true parallel { - + stage('L0: Test utils') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" pytest audio_based_utils/*.py --cpu' + } + } stage('L0: En TN grammars') { steps { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir ${EN_TN_CACHE}' diff --git a/tests/nemo_text_processing/audio_based_utils/__init__.py b/tests/nemo_text_processing/audio_based_utils/__init__.py new file mode 100644 index 000000000..4fc50543f --- /dev/null +++ b/tests/nemo_text_processing/audio_based_utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/nemo_text_processing/audio_based_utils/test_audio_based_utils.py b/tests/nemo_text_processing/audio_based_utils/test_audio_based_utils.py new file mode 100644 index 000000000..cf6cf0600 --- /dev/null +++ b/tests/nemo_text_processing/audio_based_utils/test_audio_based_utils.py @@ -0,0 +1,30 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from nemo_text_processing.text_normalization.utils_audio_based import get_alignment + + +class TestAudioBasedTNUtils: + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_default(self): + raw = 'This is #4 ranking on G.S.K.T.' 
+ pred_text = 'this iss for ranking on g k p' + norm = 'This is nubmer four ranking on GSKT' + + output = get_alignment(raw, norm, pred_text, True) + reference = (['is #4', 'G.S.K.T.'], ['iss for', 'g k p'], ['is nubmer four', 'GSKT'], ['This', '[SEMIOTIC_SPAN]', 'ranking', 'on', '[SEMIOTIC_SPAN]'], [1, 4]) + assert output == reference + From 5b2ed16b3c13f4c7f686d415b2393080cf1df436 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Jul 2023 04:32:20 +0000 Subject: [PATCH 4/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../audio_based_utils/test_audio_based_utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/nemo_text_processing/audio_based_utils/test_audio_based_utils.py b/tests/nemo_text_processing/audio_based_utils/test_audio_based_utils.py index cf6cf0600..c2c8dbc97 100644 --- a/tests/nemo_text_processing/audio_based_utils/test_audio_based_utils.py +++ b/tests/nemo_text_processing/audio_based_utils/test_audio_based_utils.py @@ -25,6 +25,11 @@ def test_default(self): norm = 'This is nubmer four ranking on GSKT' output = get_alignment(raw, norm, pred_text, True) - reference = (['is #4', 'G.S.K.T.'], ['iss for', 'g k p'], ['is nubmer four', 'GSKT'], ['This', '[SEMIOTIC_SPAN]', 'ranking', 'on', '[SEMIOTIC_SPAN]'], [1, 4]) + reference = ( + ['is #4', 'G.S.K.T.'], + ['iss for', 'g k p'], + ['is nubmer four', 'GSKT'], + ['This', '[SEMIOTIC_SPAN]', 'ranking', 'on', '[SEMIOTIC_SPAN]'], + [1, 4], + ) assert output == reference - From d4a0623edfed1ba4ab1e117db367fe738baea19b Mon Sep 17 00:00:00 2001 From: Evelina Date: Mon, 24 Jul 2023 05:33:58 -0700 Subject: [PATCH 5/7] fix path Signed-off-by: Evelina --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 5139511da..b1dbaf581 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -68,7 +68,7 @@ pipeline { 
parallel { stage('L0: Test utils') { steps { - sh 'CUDA_VISIBLE_DEVICES="" pytest audio_based_utils/*.py --cpu' + sh 'CUDA_VISIBLE_DEVICES="" pytest nemo_text_processing/audio_based_utils/*.py --cpu' } } stage('L0: En TN grammars') { From e2ada92b5c66d19c1e62a14ff33c4b07c5ba13c5 Mon Sep 17 00:00:00 2001 From: Evelina Date: Mon, 24 Jul 2023 05:40:06 -0700 Subject: [PATCH 6/7] fix path Signed-off-by: Evelina --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index b1dbaf581..98ae43952 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -68,7 +68,7 @@ pipeline { parallel { stage('L0: Test utils') { steps { - sh 'CUDA_VISIBLE_DEVICES="" pytest nemo_text_processing/audio_based_utils/*.py --cpu' + sh 'CUDA_VISIBLE_DEVICES="" pytest nemo_text_processing/audio_based_utils/test_audio_based_utils.py --cpu' } } stage('L0: En TN grammars') { From 71172ceda71fe21def512c2d3feb39698e76ed6d Mon Sep 17 00:00:00 2001 From: Evelina Date: Tue, 15 Aug 2023 11:19:52 -0700 Subject: [PATCH 7/7] fix pytest Signed-off-by: Evelina --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9f2a62a95..f9357c2e3 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -68,7 +68,7 @@ pipeline { parallel { stage('L0: Test utils') { steps { - sh 'CUDA_VISIBLE_DEVICES="" pytest nemo_text_processing/audio_based_utils/test_audio_based_utils.py --cpu' + sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/audio_based_utils/ --cpu' } } stage('L0: En TN grammars') {