From 2da5a9eee3b2d97e165b68379e50157673717381 Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Mon, 6 Jun 2022 22:19:51 -0600 Subject: [PATCH 1/4] Update container to 22.05 (#4329) * update container to 22.05 Signed-off-by: ericharper * try adding safe directory Signed-off-by: ericharper * try env var Signed-off-by: ericharper * printenv Signed-off-by: ericharper * try GIT_BRANCH Signed-off-by: ericharper * typo Signed-off-by: ericharper * remove dbug statements Signed-off-by: ericharper Signed-off-by: stevehuang52 --- Jenkinsfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index fd9c57df66a7..eedf2dfcdc27 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -15,7 +15,6 @@ pipeline { stage('Add git safe directory'){ steps{ sh 'git config --global --add safe.directory /var/lib/jenkins/workspace/NeMo_$GIT_BRANCH' - sh 'git config --global --add safe.directory /raid/JenkinsWorkDir/workspace/NeMo_$GIT_BRANCH' } } From b390f1e764e28186706ca8889694c1a84a61a6d8 Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Tue, 7 Jun 2022 15:03:40 -0600 Subject: [PATCH 2/4] Merge r1.9.0 main (#4331) * update branch Signed-off-by: ericharper * update package info Signed-off-by: ericharper * cleaned up TN/ ITN doc (#4119) * cleaned up TN/ ITN doc Signed-off-by: Yang Zhang * fix typo Signed-off-by: Yang Zhang * fix image Signed-off-by: Yang Zhang * fix image Signed-off-by: Yang Zhang * Draft: Fix restoring from checkpoint for case when `model.common_dataset_parameters.label_vocab_dir` is provided (#4136) * Fix restoring from checkpoint with label vocab dir Signed-off-by: PeganovAnton * Add tests for various ways to pass label ids to model Signed-off-by: PeganovAnton * Fix typo Signed-off-by: PeganovAnton * Fix typo Signed-off-by: PeganovAnton * Do not create tmp directory Signed-off-by: PeganovAnton * Fix parameter name Signed-off-by: PeganovAnton * finish cherry-pick op Signed-off-by: PeganovAnton * Fix labels errors Signed-off-by: PeganovAnton * Remove duplicate 
stage Signed-off-by: PeganovAnton * Change target branch Signed-off-by: PeganovAnton * fix doc (#4146) Signed-off-by: Yang Zhang * Tacotron2 retrain (#4103) * fix yaml Signed-off-by: treacker * Fix for new TTSDataset class Signed-off-by: treacker * added wandb logging Signed-off-by: treacker * added wandb logging Signed-off-by: treacker * fix numpy version Signed-off-by: treacker * fix numpy version Signed-off-by: treacker * inference fix Signed-off-by: treacker * removed old code Signed-off-by: treacker * updated parser logic Signed-off-by: treacker * reverted version update Signed-off-by: treacker * refactored parser logic Signed-off-by: treacker * Updated Jenkinsfile Signed-off-by: treacker * Refactored tutorial for Tacotron2 Signed-off-by: treacker * Made backward compatibility Signed-off-by: treacker * Made backward compatibility Signed-off-by: treacker * Update Jenkinsfile Signed-off-by: treacker * Update tacotron.yaml Signed-off-by: treacker * Refactoring Signed-off-by: treacker * cleaned up TN/ ITN doc (#4119) * cleaned up TN/ ITN doc Signed-off-by: Yang Zhang * fix typo Signed-off-by: Yang Zhang * fix image Signed-off-by: Yang Zhang * fix image Signed-off-by: Yang Zhang Signed-off-by: treacker * Check implicit grad acc in GLUE dataset building (#4123) * Check implicit grad acc in GLUE dataset building Signed-off-by: MaximumEntropy * Fix jenkins test for GLUE/XNLI Signed-off-by: MaximumEntropy Signed-off-by: treacker * Refactoring Signed-off-by: treacker * Refactoring Signed-off-by: treacker * Fixed jenkins Signed-off-by: treacker * Refactoring Signed-off-by: treacker * Refactoring Signed-off-by: treacker * Refactoring Signed-off-by: treacker Co-authored-by: Yang Zhang Co-authored-by: Sandeep Subramanian * Multiprocess improvements (#4127) * initial commit Signed-off-by: nithinraok * start fix Signed-off-by: nithinraok * improve multiprocessing speed while creating speaker dataset Signed-off-by: nithinraok * updated scp to filelist Signed-off-by: nithinraok 
* notebooks' link, typo and import fix (#4158) * redo missing pr 4007 Signed-off-by: fayejf * remove extremely unreliable links Signed-off-by: fayejf * update speaker docs (#4164) * update speaker docs Signed-off-by: nithinraok * chunks -> segments Signed-off-by: nithinraok * Khz -> kHz Signed-off-by: nithinraok * small fix (#4180) Signed-off-by: fayejf * fix the server key value problem (#4196) Signed-off-by: Yi Dong * Fix/punctuation/trainer required for setting test data (#4199) * Draft of fix Signed-off-by: PeganovAnton * Add warnings and replace global_step with current_epoch Signed-off-by: PeganovAnton * Small improvements to warnings Signed-off-by: PeganovAnton * Error and warning messages improvements Signed-off-by: PeganovAnton * Replace self.trainer with self._trainer Signed-off-by: PeganovAnton * Update ContextNet version (#4207) Signed-off-by: smajumdar * fix bugs for dialogue tutorial (#4211) Signed-off-by: Zhilin Wang * Dialogue tutorial fix (#4214) * fix bugs for dialogue tutorial Signed-off-by: Zhilin Wang * update path for convert_datasets.py due to conflict PR Signed-off-by: Zhilin Wang * Add docs for Thutmose Tagger (#4173) * Add docs for Thutmose Tagger Signed-off-by: Alexandra Antonova * add level in docs Signed-off-by: Alexandra Antonova * delete folder to avoid error with running when folder exists from previous run Signed-off-by: Alexandra Antonova Co-authored-by: Alexandra Antonova Co-authored-by: ekmb * Dialogue tutorial fix (#4218) * fix bugs for dialogue tutorial Signed-off-by: Zhilin Wang * update path for convert_datasets.py due to conflict PR Signed-off-by: Zhilin Wang * restore previously deleted files Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * Dialogue tutorial fix (#4221) * fix bugs for dialogue tutorial Signed-off-by: Zhilin Wang * update path for convert_datasets.py due to conflict PR Signed-off-by: Zhilin Wang * restore previously deleted files Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin 
Wang * update tutorial Signed-off-by: Zhilin Wang * fix syntax error in ipynb-file (#4228) Signed-off-by: Alexandra Antonova Co-authored-by: Alexandra Antonova * fix json serialize (#4235) Signed-off-by: Yi Dong * Prompt Learning Typo Fixes (#4238) * Prompt tuning notebook typo fixes Signed-off-by: Virginia Adams * Update tutorials.rst * Update prompt_learning.rst * Update prompt_learning.rst * fixing bug 3642622 (#4250) * fixing bug 3642622 Signed-off-by: Ghasem Pasandi * fixing bug 3642622 Signed-off-by: Ghasem Pasandi Co-authored-by: Ghasem Pasandi * fix broken link in the tutorial (#4257) Signed-off-by: Alexandra Antonova Co-authored-by: Alexandra Antonova * Typo fix, branch change, better download message (#4262) Signed-off-by: Virginia Adams * Raise error if bicleaner is not installed in NMT Data preprocessing notebook (#4264) * Raise error if bicleaner is not installed Signed-off-by: MaximumEntropy * Clear cells Signed-off-by: MaximumEntropy * Fix missing validation dataset, whitelist certain keywords for datasets (#4269) * Fix missing validation dataset, whitelist certain keywords for datasets Signed-off-by: smajumdar * Fix missing validation dataset, whitelist certain keywords for datasets Signed-off-by: smajumdar * Update asr configs with num_workers and pin_memory (#4270) Signed-off-by: smajumdar * Fix epoch end (#4265) Signed-off-by: MaximumEntropy Co-authored-by: Eric Harper * Set Save on train end to false (#4274) * Set Save on train end to false Signed-off-by: Virginia Adams * Update prompt_learning.rst * Update prompt_learning.rst * Update YAML (#4261) Signed-off-by: MaximumEntropy * Updated config to fix CI test OOM error (#4279) * Updated config to fix CI test issue Signed-off-by: Virginia Adams * Increased num workers Signed-off-by: Virginia Adams * verbose k2 install, skip if failed (#4289) Signed-off-by: Aleksandr Laptev Co-authored-by: Aleksandr Laptev * Changed total virtual prompt tokens (#4295) * Changed total virtual prompt tokens 
Signed-off-by: Virginia Adams * put number of workers back Signed-off-by: Virginia Adams * upper bound lightning Signed-off-by: ericharper * update branch Signed-off-by: ericharper * update config Signed-off-by: ericharper * remove duplicate test Signed-off-by: ericharper * fix tn test cases Signed-off-by: ericharper * add another safe.directory Signed-off-by: ericharper * typo Signed-off-by: ericharper Co-authored-by: Yang Zhang Co-authored-by: PeganovAnton Co-authored-by: treacker <36159472+treacker@users.noreply.github.com> Co-authored-by: Sandeep Subramanian Co-authored-by: Nithin Rao Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> Co-authored-by: Yi Dong <43824965+yidong72@users.noreply.github.com> Co-authored-by: Somshubra Majumdar Co-authored-by: Zhilin Wang Co-authored-by: bene-ges <61418381+bene-ges@users.noreply.github.com> Co-authored-by: Alexandra Antonova Co-authored-by: ekmb Co-authored-by: Virginia Adams <78445382+vadam5@users.noreply.github.com> Co-authored-by: Ghasem <35242805+pasandi20@users.noreply.github.com> Co-authored-by: Ghasem Pasandi Co-authored-by: Aleksandr Laptev Co-authored-by: Aleksandr Laptev Signed-off-by: stevehuang52 --- Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Jenkinsfile b/Jenkinsfile index eedf2dfcdc27..fd9c57df66a7 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -15,6 +15,7 @@ pipeline { stage('Add git safe directory'){ steps{ sh 'git config --global --add safe.directory /var/lib/jenkins/workspace/NeMo_$GIT_BRANCH' + sh 'git config --global --add safe.directory /raid/JenkinsWorkDir/workspace/NeMo_$GIT_BRANCH' } } From c1ced75d60e0bdc3d89f9dbb5fa0a48d41d3d69c Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 7 Jun 2022 15:37:32 -0400 Subject: [PATCH 3/4] fix full_randn bucket hang Signed-off-by: stevehuang52 --- nemo/collections/asr/modules/rnnt.py | 30 ++++++---------------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/nemo/collections/asr/modules/rnnt.py 
b/nemo/collections/asr/modules/rnnt.py index e8d5a8efdc56..b63ddc81578a 100644 --- a/nemo/collections/asr/modules/rnnt.py +++ b/nemo/collections/asr/modules/rnnt.py @@ -914,31 +914,14 @@ def forward( else: losses = None - # Compute WER for sub batch + # Update WER for sub batch if compute_wer: sub_enc = sub_enc.transpose(1, 2) # [B, T, D] -> [B, D, T] sub_enc = sub_enc.detach() sub_transcripts = sub_transcripts.detach() - original_log_prediction = self.wer.log_prediction - if original_log_prediction and batch_idx == 0: - self.wer.log_prediction = True - else: - self.wer.log_prediction = False - - # Compute the wer (with logging for just 1st sub-batch) + # Update WER on each process without syncing self.wer.update(sub_enc, sub_enc_lens, sub_transcripts, sub_transcript_lens) - wer, wer_num, wer_denom = self.wer.compute() - self.wer.reset() - - wer_numer_list.append(wer_num) - wer_denom_list.append(wer_denom) - - # Reset logging default - self.wer.log_prediction = original_log_prediction - - else: - wer = None del sub_enc, sub_transcripts, sub_enc_lens, sub_transcript_lens @@ -951,12 +934,11 @@ def forward( # Collect sub batch wer results if compute_wer: - wer_num = torch.tensor(wer_numer_list, dtype=torch.long) - wer_denom = torch.tensor(wer_denom_list, dtype=torch.long) - - wer_num = wer_num.sum() # global sum of correct words/chars - wer_denom = wer_denom.sum() # global sum of all words/chars + # Sync and all_reduce on all processes, compute global WER + wer, wer_num, wer_denom = self.wer.compute() + self.wer.reset() else: + wer = None wer_num = None wer_denom = None From 7503d47e44775417c1dffa32f435e9480c5d3c69 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 7 Jun 2022 16:40:48 -0400 Subject: [PATCH 4/4] remove unused variables Signed-off-by: stevehuang52 --- nemo/collections/asr/modules/rnnt.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nemo/collections/asr/modules/rnnt.py b/nemo/collections/asr/modules/rnnt.py index b63ddc81578a..4604bc27fe9f 
100644 --- a/nemo/collections/asr/modules/rnnt.py +++ b/nemo/collections/asr/modules/rnnt.py @@ -845,8 +845,6 @@ def forward( ) losses = [] - wer_numer_list = [] - wer_denom_list = [] batch_size = int(encoder_outputs.size(0)) # actual batch size # Iterate over batch using fused_batch_size steps