From c8ca1de6384b10bede39d4158cd2cf2e97d1cb6f Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 9 Dec 2024 13:00:43 +0100 Subject: [PATCH 1/3] Pass number of experts to modelopt layer spec Signed-off-by: Jan Lasek --- nemo/collections/llm/quantization/utils.py | 2 +- .../megatron/gpt_layer_modelopt_spec.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py index 57022d9d3e98..6771e91e7e6d 100644 --- a/nemo/collections/llm/quantization/utils.py +++ b/nemo/collections/llm/quantization/utils.py @@ -55,7 +55,7 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig: get_gpt_layer_modelopt_spec, ) - model_cfg.transformer_layer_spec = get_gpt_layer_modelopt_spec() + model_cfg.transformer_layer_spec = get_gpt_layer_modelopt_spec(num_experts=model_cfg.num_moe_experts) if model_cfg.sequence_parallel: logging.warning("Disabling sequence parallelism for quantization...") model_cfg.sequence_parallel = False diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py index 046e032093b1..514ef62a9ff3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + try: from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add @@ -37,7 +39,7 @@ # Use this spec for Model Optimizer PTQ and TensorRT-LLM export -def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec: +def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec: """Mix the native spec with TENorm and TEDotProductAttention. This is essentially the native local spec except for the layernorm implementation @@ -45,6 +47,12 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec: prevents the apex dependency. TEDotProductAttention is used to support sliding window attention. + + Args: + num_experts (int): Number of experts. Defaults to None. + + Returns: + ModuleSpec: Module specification with Megatron-Core modules. """ if not HAVE_MEGATRON_CORE: raise IMPORT_ERROR @@ -79,7 +87,7 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec: # Helper function to get module spec for MLP/MoE -def _get_mlp_module_spec(num_experts: int = None) -> ModuleSpec: +def _get_mlp_module_spec(num_experts: Optional[int] = None) -> ModuleSpec: if num_experts is None: # Dense MLP w/ or w/o TE modules. 
return ModuleSpec( From 876c4303c79b7be9177641ce128f3adbc20d238c Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 9 Dec 2024 15:25:23 +0100 Subject: [PATCH 2/3] modelopt 0.21.0 update Signed-off-by: Jan Lasek --- Dockerfile.ci | 2 +- examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml | 4 ++-- .../tuning/conf/megatron_gpt_qat_config.yaml | 4 ++-- nemo/collections/llm/quantization/quantizer.py | 4 ++-- nemo/collections/llm/quantization/utils.py | 6 +++--- nemo/export/quantize/quantizer.py | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index 1d4173f9689c..e93d00d03195 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea -ARG MODELOPT_VERSION=0.19.0 +ARG MODELOPT_VERSION=0.21.0 ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml index 62f0e452d3b5..ff8d8ca7c944 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml @@ -32,7 +32,7 @@ model: activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' quantization: - decoder_type: ${export.decoder_type} # gptnext, gpt2, llama + decoder_type: ${export.decoder_type} # gpt, llama algorithm: fp8 # null, int8_sq, fp8, int4_awq calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset num_calib_size: 512 # number of samples used for calibration @@ -41,7 +41,7 @@ quantization: enable_kv_cache: null # Enable FP8 KV cache quantization. Set to null for automatic selection. export: - decoder_type: llama # gptnext, gpt2, llama + decoder_type: llama # gpt, llama inference_tensor_parallel: 1 # Default using 1 TP for inference inference_pipeline_parallel: 1 # Default using 1 PP for inference dtype: 16 # Default precision data type for non-quantized layers: 16 or bf16 diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml index 09e00f8be110..35b0257b743b 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml @@ -190,7 +190,7 @@ model: reduce_on_plateau: false quantization: - decoder_type: ${export.decoder_type} # gptnext, gpt2, llama + decoder_type: ${export.decoder_type} # gpt, llama algorithm: int4 # null, int8_sq, fp8, int4_awq, int4 num_calib_size: 512 # number of samples used for calibration awq_block_size: 128 # block size for scaling factors (only used in AWQ algorithms) @@ -198,7 +198,7 @@ quantization: enable_kv_cache: false # Enable FP8 KV cache quantization. Set to null for automatic selection. 
export: - decoder_type: llama # gptnext, gpt2, llama + decoder_type: llama # gpt, llama inference_tensor_parallel: 1 # Default using 1 TP for inference inference_pipeline_parallel: 1 # Default using 1 PP for inference dtype: ${trainer.precision} # Default precision data type diff --git a/nemo/collections/llm/quantization/quantizer.py b/nemo/collections/llm/quantization/quantizer.py index 4779cc3915c8..16ae1319e733 100644 --- a/nemo/collections/llm/quantization/quantizer.py +++ b/nemo/collections/llm/quantization/quantizer.py @@ -198,7 +198,7 @@ def quantize(self, model: MegatronParallel, forward_loop=None): # TODO: Investigate why enabling FP8 kv cache will cause accuracy regressions for Nemotron. enable_quant_kv_cache = self.quantization_config.enable_kv_cache if enable_quant_kv_cache is None: - enable_quant_kv_cache = "int8" not in algorithm and decoder_type != "gptnext" + enable_quant_kv_cache = "int8" not in algorithm and decoder_type != "gpt" logging.info(f'{"Enabled" if enable_quant_kv_cache else "Disabled"} KV cache quantization') quant_cfg["quant_cfg"]["*output_quantizer"] = { "num_bits": 8 if algorithm == "int8_sq" else (4, 3), @@ -212,7 +212,7 @@ def quantize(self, model: MegatronParallel, forward_loop=None): unwrapped_model = mtq.quantize(unwrapped_model, quant_cfg, forward_loop) - if decoder_type == "gptnext": + if decoder_type == "gpt": # We found squared_relu may have an under-calibration problem. # Clamp the scaling_factor with a min threshold to avoid under-calibration. match algorithm: diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py index 6771e91e7e6d..1eadd2a42a72 100644 --- a/nemo/collections/llm/quantization/utils.py +++ b/nemo/collections/llm/quantization/utils.py @@ -33,10 +33,10 @@ def get_modelopt_decoder_type(model: llm.GPTModel) -> str: (llm.LlamaModel, "llama"), (llm.MistralModel, "llama"), (llm.MixtralModel, "llama"), - (llm.NemotronModel, "gptnext"), + (llm.NemotronModel, "gpt"), (llm.Qwen2Model, "qwen"), - (llm.StarcoderModel, "gptnext"), - (llm.Starcoder2Model, "gptnext"), + (llm.StarcoderModel, "gpt"), + (llm.Starcoder2Model, "gpt"), (llm.Phi3Model, "phi3"), ] diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index cbf3ea39921e..aa1501c67bef 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -120,7 +120,7 @@ def __init__(self, quantization_config: Optional[DictConfig], export_config: Opt enable_quant_kv_cache = quantization_config.get("enable_kv_cache", None) if enable_quant_kv_cache is None: enable_quant_kv_cache = ( - "int8" not in quantization_config.algorithm and quantization_config.decoder_type != "gptnext" + "int8" not in quantization_config.algorithm and quantization_config.decoder_type != "gpt" ) logging.info(f'{"Enabled" if enable_quant_kv_cache else "Disabled"} KV cache quantization') quant_cfg["quant_cfg"]["*output_quantizer"] = { @@ -200,7 +200,7 @@ def quantize(self, model: MegatronGPTModel, forward_loop: Callable[[MegatronGPTM model = mtq.quantize(model, self.quant_cfg, forward_loop) - if self.quantization_config.decoder_type == "gptnext": + if self.quantization_config.decoder_type == "gpt": # We found squared_relu may have an under-calibration problem. # Clamp the scaling_factor with a min threshold to avoid under-calibration. 
maxbound = 0 From 7c84a438d8167e68286bffa8cc331bfccfdf0633 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 9 Dec 2024 15:32:44 +0100 Subject: [PATCH 3/3] Fix too long lines Signed-off-by: Jan Lasek --- nemo/collections/llm/quantization/utils.py | 4 ++-- nemo/export/quantize/quantizer.py | 7 ++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py index 1eadd2a42a72..13859260f3e2 100644 --- a/nemo/collections/llm/quantization/utils.py +++ b/nemo/collections/llm/quantization/utils.py @@ -59,8 +59,8 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig: if model_cfg.sequence_parallel: logging.warning("Disabling sequence parallelism for quantization...") model_cfg.sequence_parallel = False - # Only custom ModelOpt spec is supported for Quantization: this custom spec is largely based on local Megatron-LM - # layer definitions to avoid Transformer Engine implementations that are currently not supported. + # Only custom ModelOpt spec is supported for quantization: this custom spec is largely based on local + # Megatron-LM layer definitions to avoid Transformer Engine implementations that are currently not supported. # This layer spec also requires RoPE fusion to be disabled for tensor view operations in attention # layer implementation from megatron/core/transformer/dot_product_attention.py to be functional. model_cfg.name = "modelopt" diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index aa1501c67bef..711b83f7a5a9 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -164,10 +164,6 @@ def modify_model_config(model_cfg: DictConfig) -> DictConfig: if model_cfg.get("sequence_parallel", False): logging.warning("Disabling sequence parallelism for quantization...") model_cfg.sequence_parallel = False - # Only custom ModelOpt spec is supported for Quantization: this custom spec is largely based on local Megatron-LM - # layer definitions to avoid Transformer Engine implementations that are currently not supported. - # This layer spec also requires RoPE fusion to be disabled for tensor view operations in attention - # layer implementation from megatron/core/transformer/dot_product_attention.py to be functional. model_cfg.name = "modelopt" model_cfg.apply_rope_fusion = False @@ -248,7 +244,8 @@ def export(self, model: MegatronGPTModel): ) dist.barrier() # Wait until all ranks complete export_model_config step logging.info( - f"Exporting quantized weights, model artifacts, and tokenizer config to {self.export_config.save_path}..." + "Exporting quantized weights, model artifacts," + f" and tokenizer config to {self.export_config.save_path}..." ) if dist.get_rank() == 0: save_artifacts(model, export_dir)
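
Note on usage: taken together, these patches let the ModelOpt layer spec follow a model's MoE
configuration (patch 1) and apply the decoder_type rename from "gptnext" to "gpt" that accompanies
the modelopt 0.21.0 bump (patches 2-3). Below is a minimal sketch of the spec call, assuming an
environment with megatron-core and nvidia-modelopt 0.21.0 installed; the expert count of 8 is a
hypothetical example value, not something fixed by these patches.

    # gpt_layer_modelopt_spec is the module edited in patch 1.
    from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec import (
        get_gpt_layer_modelopt_spec,
    )

    # Dense model: num_experts=None keeps the previous behaviour (plain MLP spec).
    dense_spec = get_gpt_layer_modelopt_spec(num_experts=None)

    # MoE model: quantizable_model_config() now forwards model_cfg.num_moe_experts,
    # which selects the MoE variant of the MLP submodule spec.
    moe_spec = get_gpt_layer_modelopt_spec(num_experts=8)  # 8 experts is an illustrative value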