diff --git a/Dockerfile.ci b/Dockerfile.ci
index 1d4173f9689c..e93d00d03195 100644
--- a/Dockerfile.ci
+++ b/Dockerfile.ci
@@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
 
 # Install NeMo requirements
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
-ARG MODELOPT_VERSION=0.19.0
+ARG MODELOPT_VERSION=0.21.0
 ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa
 
 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
index 62f0e452d3b5..ff8d8ca7c944 100644
--- a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
@@ -32,7 +32,7 @@ model:
   activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
 
 quantization:
-  decoder_type: ${export.decoder_type} # gptnext, gpt2, llama
+  decoder_type: ${export.decoder_type} # gpt, llama
   algorithm: fp8 # null, int8_sq, fp8, int4_awq
   calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset
   num_calib_size: 512 # number of samples used for calibration
@@ -41,7 +41,7 @@ quantization:
   enable_kv_cache: null # Enable FP8 KV cache quantization. Set to null for automatic selection.
 
 export:
-  decoder_type: llama # gptnext, gpt2, llama
+  decoder_type: llama # gpt, llama
   inference_tensor_parallel: 1 # Default using 1 TP for inference
   inference_pipeline_parallel: 1 # Default using 1 PP for inference
   dtype: 16 # Default precision data type for non-quantized layers: 16 or bf16
diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml
index 09e00f8be110..35b0257b743b 100644
--- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml
+++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml
@@ -190,7 +190,7 @@ model:
       reduce_on_plateau: false
 
 quantization:
-  decoder_type: ${export.decoder_type} # gptnext, gpt2, llama
+  decoder_type: ${export.decoder_type} # gpt, llama
   algorithm: int4 # null, int8_sq, fp8, int4_awq, int4
   num_calib_size: 512 # number of samples used for calibration
   awq_block_size: 128 # block size for scaling factors (only used in AWQ algorithms)
@@ -198,7 +198,7 @@ quantization:
   enable_kv_cache: false # Enable FP8 KV cache quantization. Set to null for automatic selection.
 
 export:
-  decoder_type: llama # gptnext, gpt2, llama
+  decoder_type: llama # gpt, llama
   inference_tensor_parallel: 1 # Default using 1 TP for inference
   inference_pipeline_parallel: 1 # Default using 1 PP for inference
   dtype: ${trainer.precision} # Default precision data type
diff --git a/nemo/collections/llm/quantization/quantizer.py b/nemo/collections/llm/quantization/quantizer.py
index 4779cc3915c8..16ae1319e733 100644
--- a/nemo/collections/llm/quantization/quantizer.py
+++ b/nemo/collections/llm/quantization/quantizer.py
@@ -198,7 +198,7 @@ def quantize(self, model: MegatronParallel, forward_loop=None):
         # TODO: Investigate why enabling FP8 kv cache will cause accuracy regressions for Nemotron.
         enable_quant_kv_cache = self.quantization_config.enable_kv_cache
         if enable_quant_kv_cache is None:
-            enable_quant_kv_cache = "int8" not in algorithm and decoder_type != "gptnext"
+            enable_quant_kv_cache = "int8" not in algorithm and decoder_type != "gpt"
         logging.info(f'{"Enabled" if enable_quant_kv_cache else "Disabled"} KV cache quantization')
         quant_cfg["quant_cfg"]["*output_quantizer"] = {
             "num_bits": 8 if algorithm == "int8_sq" else (4, 3),
@@ -212,7 +212,7 @@ def quantize(self, model: MegatronParallel, forward_loop=None):
 
         unwrapped_model = mtq.quantize(unwrapped_model, quant_cfg, forward_loop)
 
-        if decoder_type == "gptnext":
+        if decoder_type == "gpt":
             # We found squared_relu may have an under-calibration problem.
             # Clamp the scaling_factor with a min threshold to avoid under-calibration.
             match algorithm:
diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py
index aa7fa61f1b38..13859260f3e2 100644
--- a/nemo/collections/llm/quantization/utils.py
+++ b/nemo/collections/llm/quantization/utils.py
@@ -33,10 +33,10 @@ def get_modelopt_decoder_type(model: llm.GPTModel) -> str:
         (llm.LlamaModel, "llama"),
         (llm.MistralModel, "llama"),
         (llm.MixtralModel, "llama"),
-        (llm.NemotronModel, "gptnext"),
+        (llm.NemotronModel, "gpt"),
         (llm.Qwen2Model, "qwen"),
-        (llm.StarcoderModel, "gptnext"),
-        (llm.Starcoder2Model, "gptnext"),
+        (llm.StarcoderModel, "gpt"),
+        (llm.Starcoder2Model, "gpt"),
         (llm.Phi3Model, "phi3"),
     ]
 
diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py
index 6f7027f12be8..711b83f7a5a9 100644
--- a/nemo/export/quantize/quantizer.py
+++ b/nemo/export/quantize/quantizer.py
@@ -120,7 +120,7 @@ def __init__(self, quantization_config: Optional[DictConfig], export_config: Opt
             enable_quant_kv_cache = quantization_config.get("enable_kv_cache", None)
             if enable_quant_kv_cache is None:
                 enable_quant_kv_cache = (
-                    "int8" not in quantization_config.algorithm and quantization_config.decoder_type != "gptnext"
+                    "int8" not in quantization_config.algorithm and quantization_config.decoder_type != "gpt"
                 )
             logging.info(f'{"Enabled" if enable_quant_kv_cache else "Disabled"} KV cache quantization')
             quant_cfg["quant_cfg"]["*output_quantizer"] = {
@@ -196,7 +196,7 @@ def quantize(self, model: MegatronGPTModel, forward_loop: Callable[[MegatronGPTM
 
         model = mtq.quantize(model, self.quant_cfg, forward_loop)
 
-        if self.quantization_config.decoder_type == "gptnext":
+        if self.quantization_config.decoder_type == "gpt":
             # We found squared_relu may have an under-calibration problem.
             # Clamp the scaling_factor with a min threshold to avoid under-calibration.
             maxbound = 0