From c67bcbb2008769c5319aef49d7b39b3c96d4494d Mon Sep 17 00:00:00 2001
From: MaximumEntropy
Date: Tue, 15 Nov 2022 10:40:03 -0800
Subject: [PATCH 1/3] Revert workers workaround

Signed-off-by: MaximumEntropy
---
 .../conf/megatron_t5_config_finetune_glue_eval.yaml   |  2 +-
 .../conf/megatron_t5_config_finetune_glue_mnli.yaml   |  4 ++--
 .../conf/megatron_t5_config_finetune_glue_xnli.yaml   |  6 +++---
 .../language_modeling/conf/megatron_t5_finetune.yaml  |  4 ++--
 .../megatron_lm_encoder_decoder_model.py              | 10 +++++++++-
 5 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml
index 87ce5ac03eb5..b7d677db8c9a 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml
@@ -28,7 +28,7 @@ model:
       global_batch_size: 1
       micro_batch_size: 1
       shuffle: False
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_seq_length: 512
       drop_last: False
diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml
index ac68b57e0216..d0c9708e0929 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml
@@ -54,7 +54,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: True
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_seq_length: 512
       drop_last: True
@@ -65,7 +65,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_seq_length: 512
       drop_last: False # TODO: Figure out if there is a way to avoid dropping last.
diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
index 1b08bc37246e..bfed42cf2f95 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
@@ -55,7 +55,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: True
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_seq_length: 512
       drop_last: True
@@ -66,7 +66,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_seq_length: 512
       drop_last: False
@@ -83,7 +83,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_seq_length: 512
       drop_last: False
diff --git a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml
index 9a5cf15cfe74..9cf7ba2cbe8f 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml
@@ -53,7 +53,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: True
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_src_seq_length: 512
       max_tgt_seq_length: 128
@@ -69,7 +69,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 0
+      num_workers: 4
       pin_memory: True
       max_src_seq_length: 512
       max_tgt_seq_length: 128
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py
index b6d70dfb649e..72c14555a8ad 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py
@@ -337,6 +337,7 @@ def training_step(self, batch, batch_idx):
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=decoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
                 grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
                 custom_sync_context_handler=custom_sync_context_handler,
             )
@@ -349,6 +350,7 @@ def training_step(self, batch, batch_idx):
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=decoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
                 grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
                 custom_sync_context_handler=custom_sync_context_handler,
             )
@@ -657,6 +659,7 @@ def validation_step_logits(self, batch, batch_idx):
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=decoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
                 grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
             )
         else:
@@ -668,6 +671,7 @@ def validation_step_logits(self, batch, batch_idx):
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=decoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
                 grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
             )

@@ -700,6 +704,7 @@ def validation_step(self, batch, batch_idx):
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=decoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
                 grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
             )
         else:
@@ -711,6 +716,7 @@ def validation_step(self, batch, batch_idx):
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=decoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
                 grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
             )

@@ -951,7 +957,7 @@ def setup_validation_data(self, cfg):
         if hasattr(self, '_validation_ds'):
             consumed_samples = 0
             self._validation_dl = self.build_pretraining_data_loader(
-                self._validation_ds, consumed_samples, num_workers=0
+                self._validation_ds, consumed_samples, num_workers=self._cfg.data.num_workers
             )

     def setup_test_data(self, cfg):
@@ -1042,6 +1048,7 @@ def dummy():
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=encoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
             )
         else:
             output_tensor = forward_backward_no_pipelining(
@@ -1052,6 +1059,7 @@ def dummy():
                 tensor_shape=tensor_shape,
                 decoder_sequence_length=encoder_seq_length,
                 dtype=self.autocast_dtype,
+                sync_batch_comm=self.cfg.get('sync_batch_comm', False),
             )

         if output_tensor:

From ef10d53beea0872f73e7e03f967f3c01d17a4cca Mon Sep 17 00:00:00 2001
From: MaximumEntropy
Date: Tue, 15 Nov 2022 16:32:50 -0800
Subject: [PATCH 2/3] Fix in config

Signed-off-by: MaximumEntropy
---
 .../conf/megatron_t5_config_finetune_glue_eval.yaml           | 2 +-
 .../conf/megatron_t5_config_finetune_glue_mnli.yaml           | 4 ++--
 .../conf/megatron_t5_config_finetune_glue_xnli.yaml           | 4 ++--
 examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml
index b7d677db8c9a..87ce5ac03eb5 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml
@@ -28,7 +28,7 @@ model:
       global_batch_size: 1
       micro_batch_size: 1
       shuffle: False
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_seq_length: 512
       drop_last: False
diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml
index d0c9708e0929..ac68b57e0216 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml
@@ -54,7 +54,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: True
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_seq_length: 512
       drop_last: True
@@ -65,7 +65,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_seq_length: 512
       drop_last: False # TODO: Figure out if there is a way to avoid dropping last.
diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
index bfed42cf2f95..7de11fbfa652 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
@@ -55,7 +55,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: True
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_seq_length: 512
       drop_last: True
@@ -66,7 +66,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_seq_length: 512
       drop_last: False
diff --git a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml
index 9cf7ba2cbe8f..9a5cf15cfe74 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml
@@ -53,7 +53,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: True
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_src_seq_length: 512
       max_tgt_seq_length: 128
@@ -69,7 +69,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_src_seq_length: 512
       max_tgt_seq_length: 128

From b1971b6a6709f8e3499280b49fd0241d0461e04f Mon Sep 17 00:00:00 2001
From: MaximumEntropy
Date: Tue, 15 Nov 2022 16:42:40 -0800
Subject: [PATCH 3/3] Fix

Signed-off-by: MaximumEntropy
---
 .../conf/megatron_t5_config_finetune_glue_xnli.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
index 7de11fbfa652..1b08bc37246e 100644
--- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml
@@ -83,7 +83,7 @@ model:
       global_batch_size: 128
       micro_batch_size: 64
       shuffle: False
-      num_workers: 4
+      num_workers: 0
       pin_memory: True
       max_seq_length: 512
       drop_last: False