From c67bcbb2008769c5319aef49d7b39b3c96d4494d Mon Sep 17 00:00:00 2001
From: MaximumEntropy
Date: Tue, 15 Nov 2022 10:40:03 -0800
Subject: [PATCH 1/3] Revert workers workaround

Signed-off-by: MaximumEntropy
---
 .../conf/megatron_t5_config_finetune_glue_eval.yaml   |  2 +-
 .../conf/megatron_t5_config_finetune_glue_mnli.yaml   |  4 ++--
 .../conf/megatron_t5_config_finetune_glue_xnli.yaml   |  6 +++---
 .../language_modeling/conf/megatron_t5_finetune.yaml  |  4 ++--
 .../megatron_lm_encoder_decoder_model.py              | 10 +++++++++-
 5 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml
index 87ce5ac03eb5..b7d677db8c9a 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml
@@ -28,7 +28,7 @@ model:
       global_batch_size: 1
       micro_batch_size: 1
       shuffle: False
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_seq_length: 512
       drop_last: False
diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml
index ac68b57e0216..d0c9708e0929 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml
@@ -54,7 +54,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: True
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_seq_length: 512
       drop_last: True
@@ -65,7 +65,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_seq_length: 512
       drop_last: False # TODO: Figure out if there is a way to avoid dropping last.
diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
index 1b08bc37246e..bfed42cf2f95 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
@@ -55,7 +55,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: True
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_seq_length: 512
       drop_last: True
@@ -66,7 +66,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_seq_length: 512
       drop_last: False
@@ -83,7 +83,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_seq_length: 512
       drop_last: False
diff --git a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml
index 9a5cf15cfe74..9cf7ba2cbe8f 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml
@@ -53,7 +53,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: True
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_src_seq_length: 512
       max_tgt_seq_length: 128
@@ -69,7 +69,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_src_seq_length: 512
       max_tgt_seq_length: 128
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py
index b6d70dfb649e..72c14555a8ad 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py
@@ -337,6 +337,7 @@ def training_step(self, batch, batch_idx):
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=decoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
                 grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
                 custom_sync_context_handler=custom_sync_context_handler,
             )
@@ -349,6 +350,7 @@ def training_step(self, batch, batch_idx):
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=decoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
                 grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
                 custom_sync_context_handler=custom_sync_context_handler,
             )
@@ -657,6 +659,7 @@ def validation_step_logits(self, batch, batch_idx):
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=decoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
                 grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
             )
         else:
@@ -668,6 +671,7 @@ def validation_step_logits(self, batch, batch_idx):
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=decoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
                 grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
             )

@@ -700,6 +704,7 @@ def validation_step(self, batch, batch_idx):
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=decoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
                 grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
             )
         else:
@@ -711,6 +716,7 @@ def validation_step(self, batch, batch_idx):
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=decoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
                 grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
             )

@@ -951,7 +957,7 @@ def setup_validation_data(self, cfg):
         if hasattr(self, '_validation_ds'):
             consumed_samples = 0
             self._validation_dl = self.build_pretraining_data_loader(
-                self._validation_ds, consumed_samples, num_workers=0
+                self._validation_ds, consumed_samples, num_workers=self._cfg.data.num_workers
             )

     def setup_test_data(self, cfg):
@@ -1042,6 +1048,7 @@ def dummy():
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=encoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
             )
         else:
             output_tensor = forward_backward_no_pipelining(
@@ -1052,6 +1059,7 @@ def dummy():
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=encoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
             )

         if output_tensor:

From ef10d53beea0872f73e7e03f967f3c01d17a4cca Mon Sep 17 00:00:00 2001
From: MaximumEntropy
Date: Tue, 15 Nov 2022 16:32:50 -0800
Subject: [PATCH 2/3] Fix in config

Signed-off-by: MaximumEntropy
---
 .../conf/megatron_t5_config_finetune_glue_eval.yaml           | 2 +-
 .../conf/megatron_t5_config_finetune_glue_mnli.yaml           | 4 ++--
 .../conf/megatron_t5_config_finetune_glue_xnli.yaml           | 4 ++--
 examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml
index b7d677db8c9a..87ce5ac03eb5 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml
@@ -28,7 +28,7 @@ model:
       global_batch_size: 1
       micro_batch_size: 1
       shuffle: False
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_seq_length: 512
       drop_last: False
diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml
index d0c9708e0929..ac68b57e0216 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml
@@ -54,7 +54,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: True
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_seq_length: 512
       drop_last: True
@@ -65,7 +65,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_seq_length: 512
       drop_last: False # TODO: Figure out if there is a way to avoid dropping last.
diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
index bfed42cf2f95..7de11fbfa652 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
@@ -55,7 +55,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: True
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_seq_length: 512
       drop_last: True
@@ -66,7 +66,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_seq_length: 512
       drop_last: False
diff --git a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml
index 9cf7ba2cbe8f..9a5cf15cfe74 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml
@@ -53,7 +53,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: True
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_src_seq_length: 512
       max_tgt_seq_length: 128
@@ -69,7 +69,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_src_seq_length: 512
       max_tgt_seq_length: 128

From b1971b6a6709f8e3499280b49fd0241d0461e04f Mon Sep 17 00:00:00 2001
From: MaximumEntropy
Date: Tue, 15 Nov 2022 16:42:40 -0800
Subject: [PATCH 3/3] Fix

Signed-off-by: MaximumEntropy
---
 .../conf/megatron_t5_config_finetune_glue_xnli.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
index 7de11fbfa652..1b08bc37246e 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
@@ -83,7 +83,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_seq_length: 512
       drop_last: False