From 5cc59db37763d26e3eeeb342a51341d1d67086e5 Mon Sep 17 00:00:00 2001
From: Jan Lasek
Date: Mon, 9 Dec 2024 13:00:43 +0100
Subject: [PATCH 1/2] Pass number of experts to modelopt layer spec

Signed-off-by: Jan Lasek
---
 nemo/collections/llm/quantization/utils.py |  2 +-
 .../megatron/gpt_layer_modelopt_spec.py    | 12 ++++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py
index 57022d9d3e98..6771e91e7e6d 100644
--- a/nemo/collections/llm/quantization/utils.py
+++ b/nemo/collections/llm/quantization/utils.py
@@ -55,7 +55,7 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig:
         get_gpt_layer_modelopt_spec,
     )
 
-    model_cfg.transformer_layer_spec = get_gpt_layer_modelopt_spec()
+    model_cfg.transformer_layer_spec = get_gpt_layer_modelopt_spec(num_experts=model_cfg.num_moe_experts)
     if model_cfg.sequence_parallel:
         logging.warning("Disabling sequence parallelism for quantization...")
         model_cfg.sequence_parallel = False
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py
index 046e032093b1..514ef62a9ff3 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Optional
+
 try:
     from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm
     from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
@@ -37,7 +39,7 @@
 
 
 # Use this spec for Model Optimizer PTQ and TensorRT-LLM export
-def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec:
+def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec:
     """Mix the native spec with TENorm and TEDotProductAttention.
 
     This is essentially the native local spec except for the layernorm implementation
@@ -45,6 +47,12 @@ def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec:
     prevents the apex dependency.
 
     TEDotProductAttention is used to support sliding window attention.
+
+    Args:
+        num_experts (int): Number of experts. Defaults to None.
+
+    Returns:
+        ModuleSpec: Module specification with Megatron-Core modules.
     """
     if not HAVE_MEGATRON_CORE:
         raise IMPORT_ERROR
@@ -79,7 +87,7 @@
 
 
 # Helper function to get module spec for MLP/MoE
-def _get_mlp_module_spec(num_experts: int = None) -> ModuleSpec:
+def _get_mlp_module_spec(num_experts: Optional[int] = None) -> ModuleSpec:
     if num_experts is None:
         # Dense MLP w/ or w/o TE modules.
         return ModuleSpec(

From 0387f5512663d8d6fa7fe2382b94b190963cd297 Mon Sep 17 00:00:00 2001
From: Jan Lasek
Date: Mon, 9 Dec 2024 15:32:44 +0100
Subject: [PATCH 2/2] Fix too long lines

Signed-off-by: Jan Lasek
---
 nemo/collections/llm/quantization/utils.py | 4 ++--
 nemo/export/quantize/quantizer.py          | 7 ++-----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py
index 6771e91e7e6d..aa7fa61f1b38 100644
--- a/nemo/collections/llm/quantization/utils.py
+++ b/nemo/collections/llm/quantization/utils.py
@@ -59,8 +59,8 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig:
     if model_cfg.sequence_parallel:
         logging.warning("Disabling sequence parallelism for quantization...")
         model_cfg.sequence_parallel = False
-    # Only custom ModelOpt spec is supported for Quantization: this custom spec is largely based on local Megatron-LM
-    # layer definitions to avoid Transformer Engine implementations that are currently not supported.
+    # Only custom ModelOpt spec is supported for quantization: this custom spec is largely based on local
+    # Megatron-LM layer definitions to avoid Transformer Engine implementations that are currently not supported.
     # This layer spec also requires RoPE fusion to be disabled for tensor view operations in attention
     # layer implementation from megatron/core/transformer/dot_product_attention.py to be functional.
     model_cfg.name = "modelopt"
diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py
index cbf3ea39921e..6f7027f12be8 100644
--- a/nemo/export/quantize/quantizer.py
+++ b/nemo/export/quantize/quantizer.py
@@ -164,10 +164,6 @@ def modify_model_config(model_cfg: DictConfig) -> DictConfig:
         if model_cfg.get("sequence_parallel", False):
             logging.warning("Disabling sequence parallelism for quantization...")
             model_cfg.sequence_parallel = False
-        # Only custom ModelOpt spec is supported for Quantization: this custom spec is largely based on local Megatron-LM
-        # layer definitions to avoid Transformer Engine implementations that are currently not supported.
-        # This layer spec also requires RoPE fusion to be disabled for tensor view operations in attention
-        # layer implementation from megatron/core/transformer/dot_product_attention.py to be functional.
         model_cfg.name = "modelopt"
         model_cfg.apply_rope_fusion = False
 
@@ -248,7 +244,8 @@ def export(self, model: MegatronGPTModel):
             )
             dist.barrier()  # Wait until all ranks complete export_model_config step
             logging.info(
-                f"Exporting quantized weights, model artifacts, and tokenizer config to {self.export_config.save_path}..."
+                "Exporting quantized weights, model artifacts,"
+                f" and tokenizer config to {self.export_config.save_path}..."
            )
             if dist.get_rank() == 0:
                 save_artifacts(model, export_dir)
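
Note (not part of the patch): a minimal usage sketch of the helper changed above, assuming NeMo with this patch applied; the expert count 8 is an arbitrary illustration.

    from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec import (
        get_gpt_layer_modelopt_spec,
    )

    # Dense model: omitting num_experts keeps the plain MLP submodule spec.
    dense_spec = get_gpt_layer_modelopt_spec()

    # MoE model: passing the expert count selects the MoE submodule spec,
    # mirroring how quantizable_model_config now forwards model_cfg.num_moe_experts.
    moe_spec = get_gpt_layer_modelopt_spec(num_experts=8)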