From 7552921b25e52de9ddd589d1316c46714260d28a Mon Sep 17 00:00:00 2001
From: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com>
Date: Tue, 15 Nov 2022 14:06:29 -0800
Subject: [PATCH 1/7] Fixing bug when loss mask is fully zero

Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com>
---
 .../nlp/models/language_modeling/megatron_bert_model.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
index 0a850289301f..e2bf31378df0 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
@@ -358,7 +358,12 @@ def loss_func(self, loss_mask, sentence_order, output_tensor):
 
         lm_loss_ = lm_loss_.float()
         loss_mask = loss_mask.float()
-        lm_loss = torch.sum(lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()
+
+        # Sometimes when the number of tokens is very small, none of the tokens get masked for prediction. In that case loss mask is all zeros
+        if loss_mask.sum() == 0:
+            lm_loss = torch.sum(lm_loss_.view(-1))* 0.0
+        else :
+            lm_loss = torch.sum(lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()
 
         if sop_logits is not None:
             sop_loss = F.cross_entropy(sop_logits.view(-1, 2).float(), sentence_order.view(-1), ignore_index=-1)

From 692d2ccda8b7adb2d4dace654b8c246e2f9afe2b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 15 Nov 2022 22:08:30 +0000
Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../nlp/models/language_modeling/megatron_bert_model.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
index e2bf31378df0..317e7000d754 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
@@ -358,11 +358,11 @@ def loss_func(self, loss_mask, sentence_order, output_tensor):
 
         lm_loss_ = lm_loss_.float()
         loss_mask = loss_mask.float()
-
+
         # Sometimes when the number of tokens is very small, none of the tokens get masked for prediction. In that case loss mask is all zeros
         if loss_mask.sum() == 0:
-            lm_loss = torch.sum(lm_loss_.view(-1))* 0.0
-        else :
+            lm_loss = torch.sum(lm_loss_.view(-1)) * 0.0
+        else:
             lm_loss = torch.sum(lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()
 
         if sop_logits is not None:
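Note on PATCH 1/2: the guard matters because dividing by loss_mask.sum() when the mask is all zeros yields NaN, and multiplying the summed loss by 0.0, rather than returning a fresh constant, keeps the loss attached to the autograd graph so backward() still runs (and data-parallel gradient reductions stay in sync). A minimal sketch outside NeMo, assuming only a per-token loss tensor and a 0/1 mask of the same shape (the helper name and sample tensors below are illustrative, not NeMo code):

    import torch

    def masked_lm_loss(per_token_loss, loss_mask):
        # Mean of per-token losses over masked positions, guarded against
        # an all-zero mask (no tokens were selected for prediction).
        per_token_loss = per_token_loss.float()
        loss_mask = loss_mask.float()
        if loss_mask.sum() == 0:
            # Zero loss that still depends on the model output.
            return torch.sum(per_token_loss.view(-1)) * 0.0
        return torch.sum(per_token_loss.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()

    per_token = torch.rand(2, 5, requires_grad=True)
    empty = torch.zeros(2, 5)
    print(masked_lm_loss(per_token, empty))         # tensor(0., grad_fn=<MulBackward0>)
    print((per_token * empty).sum() / empty.sum())  # unguarded version: tensor(nan, ...)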
From a4afc8d9d53cabe06d6d00bf1d29c47a90aec515 Mon Sep 17 00:00:00 2001
From: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com>
Date: Wed, 16 Nov 2022 10:49:13 -0800
Subject: [PATCH 3/7] Update megatron_bert_model.py

Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com>
---
 .../nlp/models/language_modeling/megatron_bert_model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
index 317e7000d754..20da2a38f7ce 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
@@ -360,6 +360,7 @@ def loss_func(self, loss_mask, sentence_order, output_tensor):
         loss_mask = loss_mask.float()
 
         # Sometimes when the number of tokens is very small, none of the tokens get masked for prediction. In that case loss mask is all zeros
+        # i.e Happens when the entire batch is masked out (Practically when MBS=1 or 2, and the number of tokens in each batch is < 7 )
         if loss_mask.sum() == 0:
             lm_loss = torch.sum(lm_loss_.view(-1)) * 0.0
         else:

From 1f948372d459dbafac30428c532f4851c03d7e22 Mon Sep 17 00:00:00 2001
From: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com>
Date: Wed, 16 Nov 2022 10:56:13 -0800
Subject: [PATCH 4/7] Update dataset_utils.py

Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com>
---
 .../nlp/data/language_modeling/megatron/dataset_utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py b/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py
index bd071cf3f05e..a4e9e0a305c8 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py
@@ -234,6 +234,10 @@ def create_masked_lm_predictions(
         return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary)
 
     num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob))))
+    if num_to_predict < 1:
+        logging.info(
+            F' > WARNING: number of tokens is : {len(tokens)} and mask_probability is {masked_lm_prob}. None of the tokens will be masked'
+            )
 
     ngrams = np.arange(1, max_ngram_size + 1, dtype=np.int64)
     if not geometric_dist:
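Note on the arithmetic behind the new warning in PATCH 4: the max(1, ...) term already floors the length-based estimate at one prediction, so num_to_predict can only fall below 1 through the max_predictions_per_seq cap. The sketch below reproduces the "fewer than 7 tokens" figure from the comment added in PATCH 3, under the assumption (illustrative, not taken from this diff) that the cap is derived upstream from the actual token count as int(masked_lm_prob * n_tokens), with the usual BERT default of masked_lm_prob = 0.15:

    masked_lm_prob = 0.15  # typical BERT masking probability

    for n_tokens in range(3, 10):
        # Assumed upstream derivation of the cap (hypothetical, for illustration):
        max_predictions_per_seq = int(masked_lm_prob * n_tokens)
        # The expression from create_masked_lm_predictions():
        num_to_predict = min(max_predictions_per_seq, max(1, int(round(n_tokens * masked_lm_prob))))
        print(n_tokens, num_to_predict)  # prints 0 for n_tokens < 7, then 1 for 7..9

Under that assumption a sequence shorter than 7 tokens yields zero predictions, the warning fires, and the resulting all-zero loss mask is exactly the case the guard in megatron_bert_model.py now handles.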
From 6dd0e9cf459415a1bf9d6fed0166e995405c699c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 16 Nov 2022 18:57:19 +0000
Subject: [PATCH 5/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../nlp/data/language_modeling/megatron/dataset_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py b/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py
index a4e9e0a305c8..587e054ad83e 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py
@@ -236,8 +236,8 @@ def create_masked_lm_predictions(
     num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob))))
     if num_to_predict < 1:
         logging.info(
-            F' > WARNING: number of tokens is : {len(tokens)} and mask_probability is {masked_lm_prob}. None of the tokens will be masked'
-            )
+            F' > WARNING: number of tokens is : {len(tokens)} and mask_probability is {masked_lm_prob}. None of the tokens will be masked'
+        )
 
     ngrams = np.arange(1, max_ngram_size + 1, dtype=np.int64)
     if not geometric_dist:

From d296a7e975eca999e3c36a029bddbb1162d54965 Mon Sep 17 00:00:00 2001
From: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com>
Date: Wed, 16 Nov 2022 11:03:00 -0800
Subject: [PATCH 6/7] Update dataset_utils.py

Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com>
---
 .../nlp/data/language_modeling/megatron/dataset_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py b/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py
index 587e054ad83e..4b1ea7b08fce 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py
@@ -235,8 +235,8 @@ def create_masked_lm_predictions(
 
     num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob))))
     if num_to_predict < 1:
-        logging.info(
-            F' > WARNING: number of tokens is : {len(tokens)} and mask_probability is {masked_lm_prob}. None of the tokens will be masked'
+        logging.warn(
+            F'Number of tokens is : {len(tokens)} and mask_probability is {masked_lm_prob}. None of the tokens will be masked'
         )
 
     ngrams = np.arange(1, max_ngram_size + 1, dtype=np.int64)
From cdfd73f1ecf1fe1207db74db6b55cac3175038fb Mon Sep 17 00:00:00 2001
From: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com>
Date: Wed, 16 Nov 2022 11:26:27 -0800
Subject: [PATCH 7/7] Update dataset_utils.py

Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com>
---
 .../nlp/data/language_modeling/megatron/dataset_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py b/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py
index 4b1ea7b08fce..75cea0bca417 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py
@@ -235,7 +235,7 @@ def create_masked_lm_predictions(
 
     num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob))))
     if num_to_predict < 1:
-        logging.warn(
+        logging.warning(
            F'Number of tokens is : {len(tokens)} and mask_probability is {masked_lm_prob}. None of the tokens will be masked'
         )
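A final note on PATCH 7: in the Python standard library, logging.warn() is a deprecated alias for logging.warning() and emits a DeprecationWarning when called; the NeMo logger imported in dataset_utils.py is assumed here to mirror that stdlib API. Compared with the pre-PATCH 6 approach of an info() call with " > WARNING: " embedded in the message text, a true warning-level record can also be filtered and routed by level. A minimal stdlib sketch (sample values only):

    import logging

    logging.basicConfig(level=logging.WARNING)

    n_tokens, masked_lm_prob = 5, 0.15  # illustrative values, not from the diff
    # warning() is the supported spelling; warn() still works but is deprecated.
    logging.warning(
        f'Number of tokens is : {n_tokens} and mask_probability is {masked_lm_prob}. '
        'None of the tokens will be masked'
    )
    # -> WARNING:root:Number of tokens is : 5 and mask_probability is 0.15. None of the tokens will be masked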