From e40693971562105686d5a5e6fb199be323d23814 Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Wed, 18 May 2022 23:15:09 +0300
Subject: [PATCH 1/5] Draft of fix

Signed-off-by: PeganovAnton
---
 .../punctuation_capitalization_model.py            | 6 ++++--
 tutorials/nlp/Punctuation_and_Capitalization.ipynb | 5 +++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
index 5a5f6c025eea..e0a93dd86500 100644
--- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
+++ b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
@@ -783,8 +783,10 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
             num_samples=cfg.num_samples,
             tokens_in_batch=cfg.tokens_in_batch,
             n_jobs=cfg.n_jobs,
-            number_of_batches_is_multiple_of=1 if train else self.trainer.num_nodes * self.trainer.num_devices,
-            batch_shuffling_random_seed=self.trainer.global_step if train else 42,
+            number_of_batches_is_multiple_of=(
+                1 if train or self.trainer is None else self.trainer.num_nodes * self.trainer.num_devices
+            ),
+            batch_shuffling_random_seed=self.trainer.global_step if self.trainer is not None and train else 42,
             verbose=cfg.verbose,
             get_label_frequencies=cfg.get_label_frequences,
             cache_dir=cfg.cache_dir,
diff --git a/tutorials/nlp/Punctuation_and_Capitalization.ipynb b/tutorials/nlp/Punctuation_and_Capitalization.ipynb
index 8b4b91eff699..7765222bb03e 100644
--- a/tutorials/nlp/Punctuation_and_Capitalization.ipynb
+++ b/tutorials/nlp/Punctuation_and_Capitalization.ipynb
@@ -990,14 +990,15 @@
     "        'tokens_in_batch': 1024,\n",
     "    },\n",
     ")\n",
-    "pretrained_model.setup_training_data()\n",
-    "pretrained_model.setup_validation_data()\n",
     "\n",
     "# and now we can create a PyTorch Lightning trainer and call `fit` again\n",
     "# for this tutorial we are setting fast_dev_run to True, and the trainer will run 1 training batch and 1 validation batch\n",
     "# for actual model training, disable the flag\n",
     "fast_dev_run = True\n",
     "trainer = pl.Trainer(devices=1, accelerator='gpu', fast_dev_run=fast_dev_run)\n",
+    "pretrained_model.set_trainer(trainer)\n",
+    "pretrained_model.setup_training_data()\n",
+    "pretrained_model.setup_validation_data()\n",
     "trainer.fit(pretrained_model)"
    ]
   }
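The notebook reordering above is the heart of the first patch: `_setup_dataloader_from_config()` reads `num_nodes`, `num_devices`, and the shuffling seed from the trainer, so the trainer has to be attached before the dataloaders are (re)built. A condensed sketch of the call order the patch establishes (the pretrained-model name is illustrative, not taken from the patch):

```python
import pytorch_lightning as pl

from nemo.collections.nlp.models import PunctuationCapitalizationModel

# A restored model has no trainer attached yet.
pretrained_model = PunctuationCapitalizationModel.from_pretrained("punctuation_en_bert")

trainer = pl.Trainer(devices=1, accelerator='gpu', fast_dev_run=True)

# Attach the trainer BEFORE rebuilding the dataloaders, so the dataset can see
# `num_nodes * num_devices` and the shuffling seed when it is constructed.
pretrained_model.set_trainer(trainer)
pretrained_model.setup_training_data()
pretrained_model.setup_validation_data()

trainer.fit(pretrained_model)
```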
From b806d512c339957a5694eb8204e2025032902202 Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Thu, 19 May 2022 00:04:58 +0300
Subject: [PATCH 2/5] Add warnings and replace global_step with current_epoch

Signed-off-by: PeganovAnton
---
 .../punctuation_capitalization_model.py | 40 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
index e0a93dd86500..7a45d93097ae 100644
--- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
+++ b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import copy
+import warnings
 from math import ceil
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -770,6 +771,39 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
             'punct_label_vocab_file': punct_label_vocab_file,
             'capit_label_vocab_file': capit_label_vocab_file,
         }
+        if train:
+            number_of_batches_is_multiple_of = 1
+            if self.trainer is None:
+                warnings.warn(
+                    f'A model attribute `trainer` is not set before training dataset setting. If training is '
+                    f'resumed from checkpoint, then current epoch data loading can be distorted: some batches '
+                    f'may be processed several times and some can be not processed at all. `trainer` attribute '
+                    f'`current_epoch` is used as random seed for shuffling batches. Now 0 is used. If the '
+                    f'checkpoint was created not during initial epoch a shuffling after training resuming can '
+                    f'be different. You may try use `exp_manager()` and '
+                    f'`PunctuationCapitalizationModel.set_trainer()` method before '
+                    f'`PunctuationCapitalizationModel.setup_training_data()` method.'
+                )
+                batch_shuffling_random_seed = 0
+            else:
+                batch_shuffling_random_seed = self.trainer.current_epoch
+        else:
+            batch_shuffling_random_seed = 0
+            if self.trainer is None:
+                warnings.warn(
+                    f'A model attribute `trainer` is not set before test dataset setting. If more than 1 GPU is '
+                    f'used for testing, then some examples may be tested several times because number of batches '
+                    f'may be not evenly divisible by number of processes. See more in description of '
+                    f'`number_of_batches_is_multiple_of` parameter of class '
+                    f'`BertPunctuationCapitalizationDataset` initializer and '
+                    f'https://pytorch.org/docs/stable/data.html#multi-process-data-loading. You may try to use '
+                    f'`PunctuationCapitalizationModel.set_trainer()` method before '
+                    f'`PunctuationCapitalizationModel.setup_validation_data()` and '
+                    f'`PunctuationCapitalizationModel.setup_test_data()` methods.'
+                )
+                number_of_batches_is_multiple_of = 1
+            else:
+                number_of_batches_is_multiple_of = self.trainer.num_nodes * self.trainer.num_devices
         dataset = BertPunctuationCapitalizationDataset(
             tokenizer=self.tokenizer,
             text_file=text_file,
@@ -783,10 +817,8 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
             num_samples=cfg.num_samples,
             tokens_in_batch=cfg.tokens_in_batch,
             n_jobs=cfg.n_jobs,
-            number_of_batches_is_multiple_of=(
-                1 if train or self.trainer is None else self.trainer.num_nodes * self.trainer.num_devices
-            ),
-            batch_shuffling_random_seed=self.trainer.global_step if self.trainer is not None and train else 42,
+            number_of_batches_is_multiple_of=number_of_batches_is_multiple_of,
+            batch_shuffling_random_seed=batch_shuffling_random_seed,
             verbose=cfg.verbose,
             get_label_frequencies=cfg.get_label_frequences,
             cache_dir=cfg.cache_dir,
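Distilled from the block the second patch adds, the decision is: training never pads the batch count and seeds shuffling with the current epoch (the commit replaces `global_step`, presumably because the seed must stay fixed for a whole epoch so that a resumed run reproduces the same batch order); evaluation pads the batch count to the world size and does not shuffle. A standalone sketch of that branching (the helper function is my packaging; the patch inlines this logic in `_setup_dataloader_from_config`):

```python
from typing import Tuple


def resolve_batch_loading_params(trainer, train: bool) -> Tuple[int, int]:
    """Return (number_of_batches_is_multiple_of, batch_shuffling_random_seed)."""
    if train:
        # Training: the batch count is never padded; the shuffling seed follows
        # the epoch, falling back to 0 when no trainer is attached.
        seed = 0 if trainer is None else trainer.current_epoch
        return 1, seed
    # Validation/test: pad the batch count so it divides evenly across all
    # processes; without a trainer, assume a single process.
    multiple = 1 if trainer is None else trainer.num_nodes * trainer.num_devices
    return multiple, 0
```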
From 9676253bf39b014381f2e26fa4c6d03ce8331de0 Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Thu, 19 May 2022 00:10:31 +0300
Subject: [PATCH 3/5] Small improvements to warnings

Signed-off-by: PeganovAnton
---
 .../punctuation_capitalization_model.py | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
index 7a45d93097ae..6501f8477837 100644
--- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
+++ b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
@@ -775,14 +775,14 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
             number_of_batches_is_multiple_of = 1
             if self.trainer is None:
                 warnings.warn(
-                    f'A model attribute `trainer` is not set before training dataset setting. If training is '
-                    f'resumed from checkpoint, then current epoch data loading can be distorted: some batches '
-                    f'may be processed several times and some can be not processed at all. `trainer` attribute '
-                    f'`current_epoch` is used as random seed for shuffling batches. Now 0 is used. If the '
-                    f'checkpoint was created not during initial epoch a shuffling after training resuming can '
-                    f'be different. You may try use `exp_manager()` and '
-                    f'`PunctuationCapitalizationModel.set_trainer()` method before '
-                    f'`PunctuationCapitalizationModel.setup_training_data()` method.'
+                    'A model attribute `trainer` is not set before training dataset setting. If training is '
+                    'resumed from checkpoint, then current epoch data loading can be distorted: some batches '
+                    'may be processed several times and some can be not processed at all. `trainer.current_epoch`'
+                    ' is used as random seed for shuffling batches. Now 0 will be used. If the '
+                    'checkpoint was created not during initial epoch a shuffling of the dataset will '
+                    'be different. You may try use `exp_manager()` function and '
+                    '`PunctuationCapitalizationModel.set_trainer()` method before '
+                    '`PunctuationCapitalizationModel.setup_training_data()` method.'
                 )
                 batch_shuffling_random_seed = 0
             else:
@@ -791,15 +791,15 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
             batch_shuffling_random_seed = 0
             if self.trainer is None:
                 warnings.warn(
-                    f'A model attribute `trainer` is not set before test dataset setting. If more than 1 GPU is '
-                    f'used for testing, then some examples may be tested several times because number of batches '
-                    f'may be not evenly divisible by number of processes. See more in description of '
-                    f'`number_of_batches_is_multiple_of` parameter of class '
-                    f'`BertPunctuationCapitalizationDataset` initializer and '
-                    f'https://pytorch.org/docs/stable/data.html#multi-process-data-loading. You may try to use '
-                    f'`PunctuationCapitalizationModel.set_trainer()` method before '
-                    f'`PunctuationCapitalizationModel.setup_validation_data()` and '
-                    f'`PunctuationCapitalizationModel.setup_test_data()` methods.'
+                    'A model attribute `trainer` is not set before test or validation dataset setting. If more '
+                    'than 1 GPU is used for testing, then some examples may be tested several times because '
+                    'number of batches may be not evenly divisible by number of processes. See more in '
+                    'description of `number_of_batches_is_multiple_of` parameter of class '
+                    '`BertPunctuationCapitalizationDataset` initializer and '
+                    'https://pytorch.org/docs/stable/data.html#multi-process-data-loading. You may try to use '
+                    '`PunctuationCapitalizationModel.set_trainer()` method before '
+                    '`PunctuationCapitalizationModel.setup_validation_data()` and '
+                    '`PunctuationCapitalizationModel.setup_test_data()` methods.'
                 )
                 number_of_batches_is_multiple_of = 1
             else:
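Besides rewording, the third patch also drops the `f` prefix from every literal in these warnings: none of them contains a placeholder, so the prefix was inert (linters such as pyflakes flag the pattern as F541, "f-string is missing placeholders"). A two-line illustration:

```python
msg = f'A model attribute `trainer` is not set'  # f-string without placeholders: the prefix does nothing
msg = 'A model attribute `trainer` is not set'   # equivalent plain literal, and lint-clean
```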
From 45f69257fdf16d177303e912f5445fb055653a61 Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Thu, 19 May 2022 00:49:52 +0300
Subject: [PATCH 4/5] Error and warning messages improvements

Signed-off-by: PeganovAnton
---
 .../punctuation_capitalization_dataset.py | 2 +-
 .../punctuation_capitalization_model.py   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py
index 48c4c683222e..929f535cfc8d 100644
--- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py
+++ b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py
@@ -1110,7 +1110,7 @@ def _check_label_ids_loaded_from_pkl(
     ) -> None:
         if not isinstance(pkl_punct_label_ids, dict):
             raise ValueError(
-                f"Punctuation label ids loaded from features file {self.features_pkl} has wrong type "
+                f"Punctuation label ids loaded from features file {self.features_pkl} have wrong type "
                 f"{type(pkl_punct_label_ids)}"
             )
         if parameter_punct_label_ids is not None:
diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
index 6501f8477837..73f21643e95a 100644
--- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
+++ b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
@@ -793,9 +793,9 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
                 warnings.warn(
                     'A model attribute `trainer` is not set before test or validation dataset setting. If more '
                     'than 1 GPU is used for testing, then some examples may be tested several times because '
-                    'number of batches may be not evenly divisible by number of processes. See more in '
-                    'description of `number_of_batches_is_multiple_of` parameter of class '
-                    '`BertPunctuationCapitalizationDataset` initializer and '
+                    'number of batches may be not evenly divisible by number of processes. This leads to '
+                    'distortion of metrics. See more in description of `number_of_batches_is_multiple_of` '
+                    'parameter of class `BertPunctuationCapitalizationDataset` initializer and '
                     'https://pytorch.org/docs/stable/data.html#multi-process-data-loading. You may try to use '
                     '`PunctuationCapitalizationModel.set_trainer()` method before '
                     '`PunctuationCapitalizationModel.setup_validation_data()` and '
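The "distortion of metrics" added to the warning above is easy to see with concrete numbers: when the batch count does not divide evenly by the number of processes, a distributed sampler pads the batch list, and the padding batches are scored twice. A toy calculation (numbers invented for illustration):

```python
from math import ceil

num_batches = 7
world_size = 4  # num_nodes * num_devices

# Every process must receive the same number of batches, so the batch list is
# padded up to the next multiple of the world size...
padded = ceil(num_batches / world_size) * world_size  # -> 8

# ...and the padding is filled with repeated batches, whose examples are then
# counted twice in the accumulated metrics.
print(f"{padded - num_batches} batch(es) evaluated twice")  # -> 1 batch(es) evaluated twice
```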
From 5184089296ee013e7b2119c7cb5bd7adfe124e9f Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Thu, 19 May 2022 10:38:17 +0300
Subject: [PATCH 5/5] Replace self.trainer with self._trainer

Signed-off-by: PeganovAnton
---
 .../punctuation_capitalization_model.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
index 73f21643e95a..5f6fa7f6164f 100644
--- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
+++ b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
@@ -773,7 +773,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
         }
         if train:
             number_of_batches_is_multiple_of = 1
-            if self.trainer is None:
+            if self._trainer is None:
                 warnings.warn(
                     'A model attribute `trainer` is not set before training dataset setting. If training is '
                     'resumed from checkpoint, then current epoch data loading can be distorted: some batches '
@@ -786,10 +786,10 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
                 )
                 batch_shuffling_random_seed = 0
             else:
-                batch_shuffling_random_seed = self.trainer.current_epoch
+                batch_shuffling_random_seed = self._trainer.current_epoch
         else:
             batch_shuffling_random_seed = 0
-            if self.trainer is None:
+            if self._trainer is None:
                 warnings.warn(
                     'A model attribute `trainer` is not set before test or validation dataset setting. If more '
                     'than 1 GPU is used for testing, then some examples may be tested several times because '
@@ -803,7 +803,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
                 )
                 number_of_batches_is_multiple_of = 1
             else:
-                number_of_batches_is_multiple_of = self.trainer.num_nodes * self.trainer.num_devices
+                number_of_batches_is_multiple_of = self._trainer.num_nodes * self._trainer.num_devices
         dataset = BertPunctuationCapitalizationDataset(
             tokenizer=self.tokenizer,
             text_file=text_file,
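The last patch is a one-token change repeated four times: the `is None` probes now read the private `_trainer` field instead of the public `trainer` property. The diff itself does not say why, but a plausible reading is that the raw field is plainly `None` when no trainer is attached, while the public property's behaviour for a detached module varies between Lightning versions (it may go through extra resolution logic rather than returning a bare `None`). The guard pattern the series converges on, sketched as a standalone helper (the function and its name are illustrative, not part of the patch):

```python
def world_size_or_single_process(model) -> int:
    """Return num_nodes * num_devices if a trainer is attached, else 1."""
    trainer = getattr(model, "_trainer", None)  # raw field: plain None when detached
    if trainer is None:
        # Same fallback the warnings in the patches describe: assume one process.
        return 1
    return trainer.num_nodes * trainer.num_devices
```

Inside `_setup_dataloader_from_config`, this value is what ends up as `number_of_batches_is_multiple_of` for the validation and test dataloaders.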