From e40693971562105686d5a5e6fb199be323d23814 Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Wed, 18 May 2022 23:15:09 +0300
Subject: [PATCH 1/5] Draft of fix

Signed-off-by: PeganovAnton
---
 .../punctuation_capitalization_model.py            | 6 ++++--
 tutorials/nlp/Punctuation_and_Capitalization.ipynb | 5 +++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
index 5a5f6c025eea..e0a93dd86500 100644
--- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
+++ b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
@@ -783,8 +783,10 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
             num_samples=cfg.num_samples,
             tokens_in_batch=cfg.tokens_in_batch,
             n_jobs=cfg.n_jobs,
-            number_of_batches_is_multiple_of=1 if train else self.trainer.num_nodes * self.trainer.num_devices,
-            batch_shuffling_random_seed=self.trainer.global_step if train else 42,
+            number_of_batches_is_multiple_of=(
+                1 if train or self.trainer is None else self.trainer.num_nodes * self.trainer.num_devices
+            ),
+            batch_shuffling_random_seed=self.trainer.global_step if self.trainer is not None and train else 42,
             verbose=cfg.verbose,
             get_label_frequencies=cfg.get_label_frequences,
             cache_dir=cfg.cache_dir,
diff --git a/tutorials/nlp/Punctuation_and_Capitalization.ipynb b/tutorials/nlp/Punctuation_and_Capitalization.ipynb
index 8b4b91eff699..7765222bb03e 100644
--- a/tutorials/nlp/Punctuation_and_Capitalization.ipynb
+++ b/tutorials/nlp/Punctuation_and_Capitalization.ipynb
@@ -990,14 +990,15 @@
     "        'tokens_in_batch': 1024,\n",
     "    },\n",
     ")\n",
-    "pretrained_model.setup_training_data()\n",
-    "pretrained_model.setup_validation_data()\n",
     "\n",
     "# and now we can create a PyTorch Lightning trainer and call `fit` again\n",
     "# for this tutorial we are setting fast_dev_run to True, and the trainer will run 1 training batch and 1 validation batch\n",
     "# for actual model training, disable the flag\n",
     "fast_dev_run = True\n",
     "trainer = pl.Trainer(devices=1, accelerator='gpu', fast_dev_run=fast_dev_run)\n",
+    "pretrained_model.set_trainer(trainer)\n",
+    "pretrained_model.setup_training_data()\n",
+    "pretrained_model.setup_validation_data()\n",
     "trainer.fit(pretrained_model)"
    ]
   }
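The notebook reordering above is the heart of the first patch: `_setup_dataloader_from_config()` reads `num_nodes`, `num_devices`, and the shuffling seed from the trainer, so the trainer has to be attached before the dataloaders are (re)built. A condensed sketch of the call order the patch establishes (the pretrained-model name is illustrative, not taken from the patch):

```python
import pytorch_lightning as pl

from nemo.collections.nlp.models import PunctuationCapitalizationModel

# A restored model has no trainer attached yet.
pretrained_model = PunctuationCapitalizationModel.from_pretrained("punctuation_en_bert")

trainer = pl.Trainer(devices=1, accelerator='gpu', fast_dev_run=True)

# Attach the trainer BEFORE rebuilding the dataloaders, so the dataset can see
# `num_nodes * num_devices` and the shuffling seed when it is constructed.
pretrained_model.set_trainer(trainer)
pretrained_model.setup_training_data()
pretrained_model.setup_validation_data()

trainer.fit(pretrained_model)
```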
From b806d512c339957a5694eb8204e2025032902202 Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Thu, 19 May 2022 00:04:58 +0300
Subject: [PATCH 2/5] Add warnings and replace global_step with current_epoch

Signed-off-by: PeganovAnton
---
 .../punctuation_capitalization_model.py | 40 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
index e0a93dd86500..7a45d93097ae 100644
--- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
+++ b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import copy
+import warnings
 from math import ceil
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -770,6 +771,39 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
             'punct_label_vocab_file': punct_label_vocab_file,
             'capit_label_vocab_file': capit_label_vocab_file,
         }
+        if train:
+            number_of_batches_is_multiple_of = 1
+            if self.trainer is None:
+                warnings.warn(
+                    f'A model attribute `trainer` is not set before training dataset setting. If training is '
+                    f'resumed from checkpoint, then current epoch data loading can be distorted: some batches '
+                    f'may be processed several times and some can be not processed at all. `trainer` attribute '
+                    f'`current_epoch` is used as random seed for shuffling batches. Now 0 is used. If the '
+                    f'checkpoint was created not during initial epoch a shuffling after training resuming can '
+                    f'be different. You may try use `exp_manager()` and '
+                    f'`PunctuationCapitalizationModel.set_trainer()` method before '
+                    f'`PunctuationCapitalizationModel.setup_training_data()` method.'
+                )
+                batch_shuffling_random_seed = 0
+            else:
+                batch_shuffling_random_seed = self.trainer.current_epoch
+        else:
+            batch_shuffling_random_seed = 0
+            if self.trainer is None:
+                warnings.warn(
+                    f'A model attribute `trainer` is not set before test dataset setting. If more than 1 GPU is '
+                    f'used for testing, then some examples may be tested several times because number of batches '
+                    f'may be not evenly divisible by number of processes. See more in description of '
+                    f'`number_of_batches_is_multiple_of` parameter of class '
+                    f'`BertPunctuationCapitalizationDataset` initializer and '
+                    f'https://pytorch.org/docs/stable/data.html#multi-process-data-loading. You may try to use '
+                    f'`PunctuationCapitalizationModel.set_trainer()` method before '
+                    f'`PunctuationCapitalizationModel.setup_validation_data()` and '
+                    f'`PunctuationCapitalizationModel.setup_test_data()` methods.'
+                )
+                number_of_batches_is_multiple_of = 1
+            else:
+                number_of_batches_is_multiple_of = self.trainer.num_nodes * self.trainer.num_devices
         dataset = BertPunctuationCapitalizationDataset(
             tokenizer=self.tokenizer,
             text_file=text_file,
@@ -783,10 +817,8 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
             num_samples=cfg.num_samples,
             tokens_in_batch=cfg.tokens_in_batch,
             n_jobs=cfg.n_jobs,
-            number_of_batches_is_multiple_of=(
-                1 if train or self.trainer is None else self.trainer.num_nodes * self.trainer.num_devices
-            ),
-            batch_shuffling_random_seed=self.trainer.global_step if self.trainer is not None and train else 42,
+            number_of_batches_is_multiple_of=number_of_batches_is_multiple_of,
+            batch_shuffling_random_seed=batch_shuffling_random_seed,
             verbose=cfg.verbose,
             get_label_frequencies=cfg.get_label_frequences,
             cache_dir=cfg.cache_dir,
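Distilled from the block the second patch adds, the decision is: training never pads the batch count and seeds shuffling with the current epoch (the commit replaces `global_step`, presumably because the seed must stay fixed for a whole epoch so that a resumed run reproduces the same batch order); evaluation pads the batch count to the world size and does not shuffle. A standalone sketch of that branching (the helper function is my packaging; the patch inlines this logic in `_setup_dataloader_from_config`):

```python
from typing import Tuple


def resolve_batch_loading_params(trainer, train: bool) -> Tuple[int, int]:
    """Return (number_of_batches_is_multiple_of, batch_shuffling_random_seed)."""
    if train:
        # Training: the batch count is never padded; the shuffling seed follows
        # the epoch, falling back to 0 when no trainer is attached.
        seed = 0 if trainer is None else trainer.current_epoch
        return 1, seed
    # Validation/test: pad the batch count so it divides evenly across all
    # processes; without a trainer, assume a single process.
    multiple = 1 if trainer is None else trainer.num_nodes * trainer.num_devices
    return multiple, 0
```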
From 9676253bf39b014381f2e26fa4c6d03ce8331de0 Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Thu, 19 May 2022 00:10:31 +0300
Subject: [PATCH 3/5] Small improvements to warnings

Signed-off-by: PeganovAnton
---
 .../punctuation_capitalization_model.py | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
index 7a45d93097ae..6501f8477837 100644
--- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
+++ b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
@@ -775,14 +775,14 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
             number_of_batches_is_multiple_of = 1
             if self.trainer is None:
                 warnings.warn(
-                    f'A model attribute `trainer` is not set before training dataset setting. If training is '
-                    f'resumed from checkpoint, then current epoch data loading can be distorted: some batches '
-                    f'may be processed several times and some can be not processed at all. `trainer` attribute '
-                    f'`current_epoch` is used as random seed for shuffling batches. Now 0 is used. If the '
-                    f'checkpoint was created not during initial epoch a shuffling after training resuming can '
-                    f'be different. You may try use `exp_manager()` and '
-                    f'`PunctuationCapitalizationModel.set_trainer()` method before '
-                    f'`PunctuationCapitalizationModel.setup_training_data()` method.'
+                    'A model attribute `trainer` is not set before training dataset setting. If training is '
+                    'resumed from checkpoint, then current epoch data loading can be distorted: some batches '
+                    'may be processed several times and some can be not processed at all. `trainer.current_epoch`'
+                    ' is used as random seed for shuffling batches. Now 0 will be used. If the '
+                    'checkpoint was created not during initial epoch a shuffling of the dataset will '
+                    'be different. You may try use `exp_manager()` function and '
+                    '`PunctuationCapitalizationModel.set_trainer()` method before '
+                    '`PunctuationCapitalizationModel.setup_training_data()` method.'
                 )
                 batch_shuffling_random_seed = 0
             else:
@@ -791,15 +791,15 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
             batch_shuffling_random_seed = 0
             if self.trainer is None:
                 warnings.warn(
-                    f'A model attribute `trainer` is not set before test dataset setting. If more than 1 GPU is '
-                    f'used for testing, then some examples may be tested several times because number of batches '
-                    f'may be not evenly divisible by number of processes. See more in description of '
-                    f'`number_of_batches_is_multiple_of` parameter of class '
-                    f'`BertPunctuationCapitalizationDataset` initializer and '
-                    f'https://pytorch.org/docs/stable/data.html#multi-process-data-loading. You may try to use '
-                    f'`PunctuationCapitalizationModel.set_trainer()` method before '
-                    f'`PunctuationCapitalizationModel.setup_validation_data()` and '
-                    f'`PunctuationCapitalizationModel.setup_test_data()` methods.'
+                    'A model attribute `trainer` is not set before test or validation dataset setting. If more '
+                    'than 1 GPU is used for testing, then some examples may be tested several times because '
+                    'number of batches may be not evenly divisible by number of processes. See more in '
+                    'description of `number_of_batches_is_multiple_of` parameter of class '
+                    '`BertPunctuationCapitalizationDataset` initializer and '
+                    'https://pytorch.org/docs/stable/data.html#multi-process-data-loading. You may try to use '
+                    '`PunctuationCapitalizationModel.set_trainer()` method before '
+                    '`PunctuationCapitalizationModel.setup_validation_data()` and '
+                    '`PunctuationCapitalizationModel.setup_test_data()` methods.'
                 )
                 number_of_batches_is_multiple_of = 1
             else:
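Besides rewording, the third patch also drops the `f` prefix from every literal in these warnings: none of them contains a placeholder, so the prefix was inert (linters such as pyflakes flag the pattern as F541, "f-string is missing placeholders"). A two-line illustration:

```python
msg = f'A model attribute `trainer` is not set'  # f-string without placeholders: the prefix does nothing
msg = 'A model attribute `trainer` is not set'   # equivalent plain literal, and lint-clean
```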
From 45f69257fdf16d177303e912f5445fb055653a61 Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Thu, 19 May 2022 00:49:52 +0300
Subject: [PATCH 4/5] Error and warning messages improvements

Signed-off-by: PeganovAnton
---
 .../punctuation_capitalization_dataset.py | 2 +-
 .../punctuation_capitalization_model.py   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py
index 48c4c683222e..929f535cfc8d 100644
--- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py
+++ b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py
@@ -1110,7 +1110,7 @@ def _check_label_ids_loaded_from_pkl(
     ) -> None:
         if not isinstance(pkl_punct_label_ids, dict):
             raise ValueError(
-                f"Punctuation label ids loaded from features file {self.features_pkl} has wrong type "
+                f"Punctuation label ids loaded from features file {self.features_pkl} have wrong type "
                 f"{type(pkl_punct_label_ids)}"
             )
         if parameter_punct_label_ids is not None:
diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
index 6501f8477837..73f21643e95a 100644
--- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
+++ b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
@@ -793,9 +793,9 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
                 warnings.warn(
                     'A model attribute `trainer` is not set before test or validation dataset setting. If more '
                     'than 1 GPU is used for testing, then some examples may be tested several times because '
-                    'number of batches may be not evenly divisible by number of processes. See more in '
-                    'description of `number_of_batches_is_multiple_of` parameter of class '
-                    '`BertPunctuationCapitalizationDataset` initializer and '
+                    'number of batches may be not evenly divisible by number of processes. This leads to '
+                    'distortion of metrics. See more in description of `number_of_batches_is_multiple_of` '
+                    'parameter of class `BertPunctuationCapitalizationDataset` initializer and '
                     'https://pytorch.org/docs/stable/data.html#multi-process-data-loading. You may try to use '
                     '`PunctuationCapitalizationModel.set_trainer()` method before '
                     '`PunctuationCapitalizationModel.setup_validation_data()` and '
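The "distortion of metrics" added to the warning above is easy to see with concrete numbers: when the batch count does not divide evenly by the number of processes, a distributed sampler pads the batch list, and the padding batches are scored twice. A toy calculation (numbers invented for illustration):

```python
from math import ceil

num_batches = 7
world_size = 4  # num_nodes * num_devices

# Every process must receive the same number of batches, so the batch list is
# padded up to the next multiple of the world size...
padded = ceil(num_batches / world_size) * world_size  # -> 8

# ...and the padding is filled with repeated batches, whose examples are then
# counted twice in the accumulated metrics.
print(f"{padded - num_batches} batch(es) evaluated twice")  # -> 1 batch(es) evaluated twice
```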
From 5184089296ee013e7b2119c7cb5bd7adfe124e9f Mon Sep 17 00:00:00 2001
From: PeganovAnton
Date: Thu, 19 May 2022 10:38:17 +0300
Subject: [PATCH 5/5] Replace self.trainer with self._trainer

Signed-off-by: PeganovAnton
---
 .../punctuation_capitalization_model.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
index 73f21643e95a..5f6fa7f6164f 100644
--- a/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
+++ b/nemo/collections/nlp/models/token_classification/punctuation_capitalization_model.py
@@ -773,7 +773,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
         }
         if train:
             number_of_batches_is_multiple_of = 1
-            if self.trainer is None:
+            if self._trainer is None:
                 warnings.warn(
                     'A model attribute `trainer` is not set before training dataset setting. If training is '
                     'resumed from checkpoint, then current epoch data loading can be distorted: some batches '
@@ -786,10 +786,10 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
                 )
                 batch_shuffling_random_seed = 0
             else:
-                batch_shuffling_random_seed = self.trainer.current_epoch
+                batch_shuffling_random_seed = self._trainer.current_epoch
         else:
             batch_shuffling_random_seed = 0
-            if self.trainer is None:
+            if self._trainer is None:
                 warnings.warn(
                     'A model attribute `trainer` is not set before test or validation dataset setting. If more '
                     'than 1 GPU is used for testing, then some examples may be tested several times because '
@@ -803,7 +803,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, train: bool) -> torch.u
                 )
                 number_of_batches_is_multiple_of = 1
             else:
-                number_of_batches_is_multiple_of = self.trainer.num_nodes * self.trainer.num_devices
+                number_of_batches_is_multiple_of = self._trainer.num_nodes * self._trainer.num_devices
         dataset = BertPunctuationCapitalizationDataset(
             tokenizer=self.tokenizer,
             text_file=text_file,
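The last patch is a one-token change repeated four times: the `is None` probes now read the private `_trainer` field instead of the public `trainer` property. The diff itself does not say why, but a plausible reading is that the raw field is plainly `None` when no trainer is attached, while the public property's behaviour for a detached module varies between Lightning versions (it may go through extra resolution logic rather than returning a bare `None`). The guard pattern the series converges on, sketched as a standalone helper (the function and its name are illustrative, not part of the patch):

```python
def world_size_or_single_process(model) -> int:
    """Return num_nodes * num_devices if a trainer is attached, else 1."""
    trainer = getattr(model, "_trainer", None)  # raw field: plain None when detached
    if trainer is None:
        # Same fallback the warnings in the patches describe: assume one process.
        return 1
    return trainer.num_nodes * trainer.num_devices
```

Inside `_setup_dataloader_from_config`, this value is what ends up as `number_of_batches_is_multiple_of` for the validation and test dataloaders.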