From 5cc59db37763d26e3eeeb342a51341d1d67086e5 Mon Sep 17 00:00:00 2001
From: Jan Lasek
Date: Mon, 9 Dec 2024 13:00:43 +0100
Subject: [PATCH 1/2] Pass number of experts to modelopt layer spec

Signed-off-by: Jan Lasek
---
 nemo/collections/llm/quantization/utils.py |  2 +-
 .../megatron/gpt_layer_modelopt_spec.py    | 12 ++++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py
index 57022d9d3e98..6771e91e7e6d 100644
--- a/nemo/collections/llm/quantization/utils.py
+++ b/nemo/collections/llm/quantization/utils.py
@@ -55,7 +55,7 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig:
         get_gpt_layer_modelopt_spec,
     )
 
-    model_cfg.transformer_layer_spec = get_gpt_layer_modelopt_spec()
+    model_cfg.transformer_layer_spec = get_gpt_layer_modelopt_spec(num_experts=model_cfg.num_moe_experts)
     if model_cfg.sequence_parallel:
         logging.warning("Disabling sequence parallelism for quantization...")
         model_cfg.sequence_parallel = False
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py
index 046e032093b1..514ef62a9ff3 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Optional
+
 try:
     from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm
     from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
@@ -37,7 +39,7 @@
 
 
 # Use this spec for Model Optimizer PTQ and TensorRT-LLM export
-def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec:
+def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec:
     """Mix the native spec with TENorm and TEDotProductAttention.
 
     This is essentially the native local spec except for the layernorm implementation
@@ -45,6 +47,12 @@ def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec:
     prevents the apex dependency.
 
     TEDotProductAttention is used to support sliding window attention.
+
+    Args:
+        num_experts (int): Number of experts. Defaults to None.
+
+    Returns:
+        ModuleSpec: Module specification with Megatron-Core modules.
     """
     if not HAVE_MEGATRON_CORE:
         raise IMPORT_ERROR
@@ -79,7 +87,7 @@
 
 
 # Helper function to get module spec for MLP/MoE
-def _get_mlp_module_spec(num_experts: int = None) -> ModuleSpec:
+def _get_mlp_module_spec(num_experts: Optional[int] = None) -> ModuleSpec:
     if num_experts is None:
         # Dense MLP w/ or w/o TE modules.
         return ModuleSpec(

From 0387f5512663d8d6fa7fe2382b94b190963cd297 Mon Sep 17 00:00:00 2001
From: Jan Lasek
Date: Mon, 9 Dec 2024 15:32:44 +0100
Subject: [PATCH 2/2] Fix too long lines

Signed-off-by: Jan Lasek
---
 nemo/collections/llm/quantization/utils.py | 4 ++--
 nemo/export/quantize/quantizer.py          | 7 ++-----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py
index 6771e91e7e6d..aa7fa61f1b38 100644
--- a/nemo/collections/llm/quantization/utils.py
+++ b/nemo/collections/llm/quantization/utils.py
@@ -59,8 +59,8 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig:
     if model_cfg.sequence_parallel:
         logging.warning("Disabling sequence parallelism for quantization...")
         model_cfg.sequence_parallel = False
-    # Only custom ModelOpt spec is supported for Quantization: this custom spec is largely based on local Megatron-LM
-    # layer definitions to avoid Transformer Engine implementations that are currently not supported.
+    # Only custom ModelOpt spec is supported for quantization: this custom spec is largely based on local
+    # Megatron-LM layer definitions to avoid Transformer Engine implementations that are currently not supported.
     # This layer spec also requires RoPE fusion to be disabled for tensor view operations in attention
     # layer implementation from megatron/core/transformer/dot_product_attention.py to be functional.
     model_cfg.name = "modelopt"
diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py
index cbf3ea39921e..6f7027f12be8 100644
--- a/nemo/export/quantize/quantizer.py
+++ b/nemo/export/quantize/quantizer.py
@@ -164,10 +164,6 @@ def modify_model_config(model_cfg: DictConfig) -> DictConfig:
         if model_cfg.get("sequence_parallel", False):
             logging.warning("Disabling sequence parallelism for quantization...")
             model_cfg.sequence_parallel = False
-        # Only custom ModelOpt spec is supported for Quantization: this custom spec is largely based on local Megatron-LM
-        # layer definitions to avoid Transformer Engine implementations that are currently not supported.
-        # This layer spec also requires RoPE fusion to be disabled for tensor view operations in attention
-        # layer implementation from megatron/core/transformer/dot_product_attention.py to be functional.
         model_cfg.name = "modelopt"
         model_cfg.apply_rope_fusion = False
 
@@ -248,7 +244,8 @@ def export(self, model: MegatronGPTModel):
             )
             dist.barrier()  # Wait until all ranks complete export_model_config step
             logging.info(
-                f"Exporting quantized weights, model artifacts, and tokenizer config to {self.export_config.save_path}..."
+                "Exporting quantized weights, model artifacts,"
+                f" and tokenizer config to {self.export_config.save_path}..."
            )
             if dist.get_rank() == 0:
                 save_artifacts(model, export_dir)
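
Note (not part of the patch): a minimal usage sketch of the helper changed above, assuming NeMo with this patch applied; the expert count 8 is an arbitrary illustration.

    from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec import (
        get_gpt_layer_modelopt_spec,
    )

    # Dense model: omitting num_experts keeps the plain MLP submodule spec.
    dense_spec = get_gpt_layer_modelopt_spec()

    # MoE model: passing the expert count selects the MoE submodule spec,
    # mirroring how quantizable_model_config now forwards model_cfg.num_moe_experts.
    moe_spec = get_gpt_layer_modelopt_spec(num_experts=8)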