From c8ca1de6384b10bede39d4158cd2cf2e97d1cb6f Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 9 Dec 2024 13:00:43 +0100 Subject: [PATCH 1/3] Pass number of experts to modelopt layer spec Signed-off-by: Jan Lasek --- nemo/collections/llm/quantization/utils.py | 2 +- .../megatron/gpt_layer_modelopt_spec.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py index 57022d9d3e98..6771e91e7e6d 100644 --- a/nemo/collections/llm/quantization/utils.py +++ b/nemo/collections/llm/quantization/utils.py @@ -55,7 +55,7 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig: get_gpt_layer_modelopt_spec, ) - model_cfg.transformer_layer_spec = get_gpt_layer_modelopt_spec() + model_cfg.transformer_layer_spec = get_gpt_layer_modelopt_spec(num_experts=model_cfg.num_moe_experts) if model_cfg.sequence_parallel: logging.warning("Disabling sequence parallelism for quantization...") model_cfg.sequence_parallel = False diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py index 046e032093b1..514ef62a9ff3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + try: from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add @@ -37,7 +39,7 @@ # Use this spec for Model Optimizer PTQ and TensorRT-LLM export -def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec: +def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec: """Mix the native spec with TENorm and TEDotProductAttention. This is essentially the native local spec except for the layernorm implementation @@ -45,6 +47,12 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec: prevents the apex dependency. TEDotProductAttention is used to support sliding window attention. + + Args: + num_experts (int): Number of experts. Defaults to None. + + Returns: + ModuleSpec: Module specification with Megatron-Core modules. """ if not HAVE_MEGATRON_CORE: raise IMPORT_ERROR @@ -79,7 +87,7 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec: # Helper function to get module spec for MLP/MoE -def _get_mlp_module_spec(num_experts: int = None) -> ModuleSpec: +def _get_mlp_module_spec(num_experts: Optional[int] = None) -> ModuleSpec: if num_experts is None: # Dense MLP w/ or w/o TE modules. 
return ModuleSpec( From 876c4303c79b7be9177641ce128f3adbc20d238c Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 9 Dec 2024 15:25:23 +0100 Subject: [PATCH 2/3] modelopt 0.21.0 update Signed-off-by: Jan Lasek --- Dockerfile.ci | 2 +- examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml | 4 ++-- .../tuning/conf/megatron_gpt_qat_config.yaml | 4 ++-- nemo/collections/llm/quantization/quantizer.py | 4 ++-- nemo/collections/llm/quantization/utils.py | 6 +++--- nemo/export/quantize/quantizer.py | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index 1d4173f9689c..e93d00d03195 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea -ARG MODELOPT_VERSION=0.19.0 +ARG MODELOPT_VERSION=0.21.0 ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml index 62f0e452d3b5..ff8d8ca7c944 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml @@ -32,7 +32,7 @@ model: activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' quantization: - decoder_type: ${export.decoder_type} # gptnext, gpt2, llama + decoder_type: ${export.decoder_type} # gpt, llama algorithm: fp8 # null, int8_sq, fp8, int4_awq calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset num_calib_size: 512 # number of samples used for calibration @@ -41,7 +41,7 @@ quantization: enable_kv_cache: null # Enable FP8 KV cache quantization. Set to null for automatic selection. export: - decoder_type: llama # gptnext, gpt2, llama + decoder_type: llama # gpt, llama inference_tensor_parallel: 1 # Default using 1 TP for inference inference_pipeline_parallel: 1 # Default using 1 PP for inference dtype: 16 # Default precision data type for non-quantized layers: 16 or bf16 diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml index 09e00f8be110..35b0257b743b 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml @@ -190,7 +190,7 @@ model: reduce_on_plateau: false quantization: - decoder_type: ${export.decoder_type} # gptnext, gpt2, llama + decoder_type: ${export.decoder_type} # gpt, llama algorithm: int4 # null, int8_sq, fp8, int4_awq, int4 num_calib_size: 512 # number of samples used for calibration awq_block_size: 128 # block size for scaling factors (only used in AWQ algorithms) @@ -198,7 +198,7 @@ quantization: enable_kv_cache: false # Enable FP8 KV cache quantization. Set to null for automatic selection. 
export: - decoder_type: llama # gptnext, gpt2, llama + decoder_type: llama # gpt, llama inference_tensor_parallel: 1 # Default using 1 TP for inference inference_pipeline_parallel: 1 # Default using 1 PP for inference dtype: ${trainer.precision} # Default precision data type diff --git a/nemo/collections/llm/quantization/quantizer.py b/nemo/collections/llm/quantization/quantizer.py index 4779cc3915c8..16ae1319e733 100644 --- a/nemo/collections/llm/quantization/quantizer.py +++ b/nemo/collections/llm/quantization/quantizer.py @@ -198,7 +198,7 @@ def quantize(self, model: MegatronParallel, forward_loop=None): # TODO: Investigate why enabling FP8 kv cache will cause accuracy regressions for Nemotron. enable_quant_kv_cache = self.quantization_config.enable_kv_cache if enable_quant_kv_cache is None: - enable_quant_kv_cache = "int8" not in algorithm and decoder_type != "gptnext" + enable_quant_kv_cache = "int8" not in algorithm and decoder_type != "gpt" logging.info(f'{"Enabled" if enable_quant_kv_cache else "Disabled"} KV cache quantization') quant_cfg["quant_cfg"]["*output_quantizer"] = { "num_bits": 8 if algorithm == "int8_sq" else (4, 3), @@ -212,7 +212,7 @@ def quantize(self, model: MegatronParallel, forward_loop=None): unwrapped_model = mtq.quantize(unwrapped_model, quant_cfg, forward_loop) - if decoder_type == "gptnext": + if decoder_type == "gpt": # We found squared_relu may have an under-calibration problem. # Clamp the scaling_factor with a min threshold to avoid under-calibration. match algorithm: diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py index 6771e91e7e6d..1eadd2a42a72 100644 --- a/nemo/collections/llm/quantization/utils.py +++ b/nemo/collections/llm/quantization/utils.py @@ -33,10 +33,10 @@ def get_modelopt_decoder_type(model: llm.GPTModel) -> str: (llm.LlamaModel, "llama"), (llm.MistralModel, "llama"), (llm.MixtralModel, "llama"), - (llm.NemotronModel, "gptnext"), + (llm.NemotronModel, "gpt"), (llm.Qwen2Model, "qwen"), - (llm.StarcoderModel, "gptnext"), - (llm.Starcoder2Model, "gptnext"), + (llm.StarcoderModel, "gpt"), + (llm.Starcoder2Model, "gpt"), (llm.Phi3Model, "phi3"), ] diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index cbf3ea39921e..aa1501c67bef 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -120,7 +120,7 @@ def __init__(self, quantization_config: Optional[DictConfig], export_config: Opt enable_quant_kv_cache = quantization_config.get("enable_kv_cache", None) if enable_quant_kv_cache is None: enable_quant_kv_cache = ( - "int8" not in quantization_config.algorithm and quantization_config.decoder_type != "gptnext" + "int8" not in quantization_config.algorithm and quantization_config.decoder_type != "gpt" ) logging.info(f'{"Enabled" if enable_quant_kv_cache else "Disabled"} KV cache quantization') quant_cfg["quant_cfg"]["*output_quantizer"] = { @@ -200,7 +200,7 @@ def quantize(self, model: MegatronGPTModel, forward_loop: Callable[[MegatronGPTM model = mtq.quantize(model, self.quant_cfg, forward_loop) - if self.quantization_config.decoder_type == "gptnext": + if self.quantization_config.decoder_type == "gpt": # We found squared_relu may have an under-calibration problem. # Clamp the scaling_factor with a min threshold to avoid under-calibration. 
maxbound = 0 From 7c84a438d8167e68286bffa8cc331bfccfdf0633 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 9 Dec 2024 15:32:44 +0100 Subject: [PATCH 3/3] Fix too long lines Signed-off-by: Jan Lasek --- nemo/collections/llm/quantization/utils.py | 4 ++-- nemo/export/quantize/quantizer.py | 7 ++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py index 1eadd2a42a72..13859260f3e2 100644 --- a/nemo/collections/llm/quantization/utils.py +++ b/nemo/collections/llm/quantization/utils.py @@ -59,8 +59,8 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig: if model_cfg.sequence_parallel: logging.warning("Disabling sequence parallelism for quantization...") model_cfg.sequence_parallel = False - # Only custom ModelOpt spec is supported for Quantization: this custom spec is largely based on local Megatron-LM - # layer definitions to avoid Transformer Engine implementations that are currently not supported. + # Only custom ModelOpt spec is supported for quantization: this custom spec is largely based on local + # Megatron-LM layer definitions to avoid Transformer Engine implementations that are currently not supported. # This layer spec also requires RoPE fusion to be disabled for tensor view operations in attention # layer implementation from megatron/core/transformer/dot_product_attention.py to be functional. model_cfg.name = "modelopt" diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index aa1501c67bef..711b83f7a5a9 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -164,10 +164,6 @@ def modify_model_config(model_cfg: DictConfig) -> DictConfig: if model_cfg.get("sequence_parallel", False): logging.warning("Disabling sequence parallelism for quantization...") model_cfg.sequence_parallel = False - # Only custom ModelOpt spec is supported for Quantization: this custom spec is largely based on local Megatron-LM - # layer definitions to avoid Transformer Engine implementations that are currently not supported. - # This layer spec also requires RoPE fusion to be disabled for tensor view operations in attention - # layer implementation from megatron/core/transformer/dot_product_attention.py to be functional. model_cfg.name = "modelopt" model_cfg.apply_rope_fusion = False @@ -248,7 +244,8 @@ def export(self, model: MegatronGPTModel): ) dist.barrier() # Wait until all ranks complete export_model_config step logging.info( - f"Exporting quantized weights, model artifacts, and tokenizer config to {self.export_config.save_path}..." + "Exporting quantized weights, model artifacts," + f" and tokenizer config to {self.export_config.save_path}..." ) if dist.get_rank() == 0: save_artifacts(model, export_dir)
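
Note on usage: taken together, these patches let the ModelOpt layer spec follow a model's MoE
configuration (patch 1) and apply the decoder_type rename from "gptnext" to "gpt" that accompanies
the modelopt 0.21.0 bump (patches 2-3). Below is a minimal sketch of the spec call, assuming an
environment with megatron-core and nvidia-modelopt 0.21.0 installed; the expert count of 8 is a
hypothetical example value, not something fixed by these patches.

    # gpt_layer_modelopt_spec is the module edited in patch 1.
    from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec import (
        get_gpt_layer_modelopt_spec,
    )

    # Dense model: num_experts=None keeps the previous behaviour (plain MLP spec).
    dense_spec = get_gpt_layer_modelopt_spec(num_experts=None)

    # MoE model: quantizable_model_config() now forwards model_cfg.num_moe_experts,
    # which selects the MoE variant of the MLP submodule spec.
    moe_spec = get_gpt_layer_modelopt_spec(num_experts=8)  # 8 experts is an illustrative value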