diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py
index 57022d9d3e98..aa7fa61f1b38 100644
--- a/nemo/collections/llm/quantization/utils.py
+++ b/nemo/collections/llm/quantization/utils.py
@@ -55,12 +55,12 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig:
         get_gpt_layer_modelopt_spec,
     )
 
-    model_cfg.transformer_layer_spec = get_gpt_layer_modelopt_spec()
+    model_cfg.transformer_layer_spec = get_gpt_layer_modelopt_spec(num_experts=model_cfg.num_moe_experts)
     if model_cfg.sequence_parallel:
         logging.warning("Disabling sequence parallelism for quantization...")
         model_cfg.sequence_parallel = False
-    # Only custom ModelOpt spec is supported for Quantization: this custom spec is largely based on local Megatron-LM
-    # layer definitions to avoid Transformer Engine implementations that are currently not supported.
+    # Only custom ModelOpt spec is supported for quantization: this custom spec is largely based on local
+    # Megatron-LM layer definitions to avoid Transformer Engine implementations that are currently not supported.
     # This layer spec also requires RoPE fusion to be disabled for tensor view operations in attention
     # layer implementation from megatron/core/transformer/dot_product_attention.py to be functional.
     model_cfg.name = "modelopt"
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py
index 046e032093b1..514ef62a9ff3 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Optional
+
 try:
     from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm
     from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
@@ -37,7 +39,7 @@
 
 
 # Use this spec for Model Optimizer PTQ and TensorRT-LLM export
-def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec:
+def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec:
     """Mix the native spec with TENorm and TEDotProductAttention.
 
     This is essentially the native local spec except for the layernorm implementation
@@ -45,6 +47,12 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec:
     prevents the apex dependency.
 
     TEDotProductAttention is used to support sliding window attention.
+
+    Args:
+        num_experts (int): Number of experts. Defaults to None.
+
+    Returns:
+        ModuleSpec: Module specification with Megatron-Core modules.
     """
     if not HAVE_MEGATRON_CORE:
         raise IMPORT_ERROR
@@ -79,7 +87,7 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec:
 
 
 # Helper function to get module spec for MLP/MoE
-def _get_mlp_module_spec(num_experts: int = None) -> ModuleSpec:
+def _get_mlp_module_spec(num_experts: Optional[int] = None) -> ModuleSpec:
     if num_experts is None:
         # Dense MLP w/ or w/o TE modules.
         return ModuleSpec(
diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py
index cbf3ea39921e..6f7027f12be8 100644
--- a/nemo/export/quantize/quantizer.py
+++ b/nemo/export/quantize/quantizer.py
@@ -164,10 +164,6 @@ def modify_model_config(model_cfg: DictConfig) -> DictConfig:
             if model_cfg.get("sequence_parallel", False):
                 logging.warning("Disabling sequence parallelism for quantization...")
                 model_cfg.sequence_parallel = False
-            # Only custom ModelOpt spec is supported for Quantization: this custom spec is largely based on local Megatron-LM
-            # layer definitions to avoid Transformer Engine implementations that are currently not supported.
-            # This layer spec also requires RoPE fusion to be disabled for tensor view operations in attention
-            # layer implementation from megatron/core/transformer/dot_product_attention.py to be functional.
             model_cfg.name = "modelopt"
             model_cfg.apply_rope_fusion = False
 
@@ -248,7 +244,8 @@ def export(self, model: MegatronGPTModel):
         )
         dist.barrier()  # Wait until all ranks complete export_model_config step
         logging.info(
-            f"Exporting quantized weights, model artifacts, and tokenizer config to {self.export_config.save_path}..."
+            "Exporting quantized weights, model artifacts,"
+            f" and tokenizer config to {self.export_config.save_path}..."
        )
         if dist.get_rank() == 0:
             save_artifacts(model, export_dir)
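
For context, a minimal usage sketch of the updated spec builder (illustrative only, not part of the patch; the import path, the function signature, and the num_moe_experts field come from the diff, while the example expert count is an assumption):

    # Illustrative sketch: build the ModelOpt layer spec for dense vs. MoE configs.
    # Requires megatron.core to be importable; mirrors the call added in quantizable_model_config().
    from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec import (
        get_gpt_layer_modelopt_spec,
    )

    dense_spec = get_gpt_layer_modelopt_spec()             # num_experts=None -> dense MLP submodules
    moe_spec = get_gpt_layer_modelopt_spec(num_experts=8)  # MoE path, e.g. model_cfg.num_moe_experts == 8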