6 changes: 3 additions & 3 deletions nemo/collections/llm/quantization/utils.py
@@ -55,12 +55,12 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig:
get_gpt_layer_modelopt_spec,
)

- model_cfg.transformer_layer_spec = get_gpt_layer_modelopt_spec()
+ model_cfg.transformer_layer_spec = get_gpt_layer_modelopt_spec(num_experts=model_cfg.num_moe_experts)
if model_cfg.sequence_parallel:
logging.warning("Disabling sequence parallelism for quantization...")
model_cfg.sequence_parallel = False
- # Only custom ModelOpt spec is supported for Quantization: this custom spec is largely based on local Megatron-LM
- # layer definitions to avoid Transformer Engine implementations that are currently not supported.
+ # Only custom ModelOpt spec is supported for quantization: this custom spec is largely based on local
+ # Megatron-LM layer definitions to avoid Transformer Engine implementations that are currently not supported.
# This layer spec also requires RoPE fusion to be disabled for tensor view operations in attention
# layer implementation from megatron/core/transformer/dot_product_attention.py to be functional.
model_cfg.name = "modelopt"
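For reference, a minimal usage sketch of the helper patched above. It is not part of the PR; the exact set of `GPTConfig` constructor fields shown is an assumption, and only behavior visible in this diff (ModelOpt spec selection, sequence parallelism disabled) is asserted.

```python
# Illustrative sketch only (not part of this PR): prepare a GPT config for
# post-training quantization. The constructor fields below are assumptions;
# the real required fields of llm.GPTConfig may differ.
from nemo.collections import llm
from nemo.collections.llm.quantization.utils import quantizable_model_config

cfg = llm.GPTConfig(
    num_layers=2,
    hidden_size=256,
    num_attention_heads=4,
    num_moe_experts=8,  # forwarded to get_gpt_layer_modelopt_spec() after this PR
)
cfg = quantizable_model_config(cfg)

assert cfg.name == "modelopt"          # custom ModelOpt layer spec selected
assert cfg.sequence_parallel is False  # sequence parallelism disabled for quantization
```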
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+ from typing import Optional
+
try:
from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
@@ -37,14 +39,20 @@


# Use this spec for Model Optimizer PTQ and TensorRT-LLM export
- def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec:
+ def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec:
"""Mix the native spec with TENorm and TEDotProductAttention.

This is essentially the native local spec except for the layernorm implementation
is using TENorm from Transformer-Engine. This TENorm supports both FusedLayerNorm and RMSNorm and
prevents the apex dependency.

TEDotProductAttention is used to support sliding window attention.

+ Args:
+     num_experts (int): Number of experts. Defaults to None.
+
+ Returns:
+     ModuleSpec: Module specification with Megatron-Core modules.
"""
if not HAVE_MEGATRON_CORE:
raise IMPORT_ERROR
@@ -79,7 +87,7 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec:


# Helper function to get module spec for MLP/MoE
- def _get_mlp_module_spec(num_experts: int = None) -> ModuleSpec:
+ def _get_mlp_module_spec(num_experts: Optional[int] = None) -> ModuleSpec:
if num_experts is None:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
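For reference, a small sketch of how the widened `Optional[int]` signature might be called. The import path below is an assumption (the header identifying this file was not captured on this page), and megatron.core must be installed for the spec to build.

```python
# Sketch only: the module path is assumed, not confirmed by this diff.
from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec import (
    get_gpt_layer_modelopt_spec,
)

dense_spec = get_gpt_layer_modelopt_spec()             # num_experts=None -> dense MLP submodules
moe_spec = get_gpt_layer_modelopt_spec(num_experts=8)  # MoE submodules for an 8-expert model
print(type(dense_spec).__name__, type(moe_spec).__name__)  # ModuleSpec ModuleSpec
```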
7 changes: 2 additions & 5 deletions nemo/export/quantize/quantizer.py
@@ -164,10 +164,6 @@ def modify_model_config(model_cfg: DictConfig) -> DictConfig:
if model_cfg.get("sequence_parallel", False):
logging.warning("Disabling sequence parallelism for quantization...")
model_cfg.sequence_parallel = False
- # Only custom ModelOpt spec is supported for Quantization: this custom spec is largely based on local Megatron-LM
- # layer definitions to avoid Transformer Engine implementations that are currently not supported.
- # This layer spec also requires RoPE fusion to be disabled for tensor view operations in attention
- # layer implementation from megatron/core/transformer/dot_product_attention.py to be functional.
model_cfg.name = "modelopt"
model_cfg.apply_rope_fusion = False

@@ -248,7 +244,8 @@ def export(self, model: MegatronGPTModel):
)
dist.barrier() # Wait until all ranks complete export_model_config step
logging.info(
f"Exporting quantized weights, model artifacts, and tokenizer config to {self.export_config.save_path}..."
"Exporting quantized weights, model artifacts,"
f" and tokenizer config to {self.export_config.save_path}..."
)
if dist.get_rank() == 0:
save_artifacts(model, export_dir)
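For reference, a generic sketch of the barrier-then-rank-0 save pattern used in `export` above. The helper name is illustrative, not the quantizer's API; it assumes torch.distributed's default process group is already initialized, with `save_fn` standing in for helpers such as `save_artifacts`.

```python
import torch.distributed as dist


def save_on_rank_zero(save_fn, *args, **kwargs):
    dist.barrier()            # wait until every rank has finished the previous step
    if dist.get_rank() == 0:  # only one rank writes artifacts to disk
        save_fn(*args, **kwargs)
    dist.barrier()            # keep ranks aligned before moving on
```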