Dockerfile.ci (2 changes: 1 addition & 1 deletion)

@@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T

 # Install NeMo requirements
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
-ARG MODELOPT_VERSION=0.19.0
+ARG MODELOPT_VERSION=0.21.0
 ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa

 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml (4 changes: 2 additions & 2 deletions)

@@ -32,7 +32,7 @@ model:
   activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'

 quantization:
-  decoder_type: ${export.decoder_type} # gptnext, gpt2, llama
+  decoder_type: ${export.decoder_type} # gpt, llama
   algorithm: fp8 # null, int8_sq, fp8, int4_awq
   calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset
   num_calib_size: 512 # number of samples used for calibration
@@ -41,7 +41,7 @@ quantization:
   enable_kv_cache: null # Enable FP8 KV cache quantization. Set to null for automatic selection.

 export:
-  decoder_type: llama # gptnext, gpt2, llama
+  decoder_type: llama # gpt, llama
   inference_tensor_parallel: 1 # Default using 1 TP for inference
   inference_pipeline_parallel: 1 # Default using 1 PP for inference
   dtype: 16 # Default precision data type for non-quantized layers: 16 or bf16
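Note on the config above: `${export.decoder_type}` is an OmegaConf interpolation, so the quantization section always follows whatever decoder type the export section sets. A minimal sketch of that resolution, assuming only that the config is loaded with OmegaConf (as NeMo's Hydra-based scripts do); the reduced config below is illustrative, not the full file:

from omegaconf import OmegaConf

# Reduced config mirroring the YAML above; quantization.decoder_type is an
# interpolation that resolves to export.decoder_type on access.
cfg = OmegaConf.create(
    {
        "quantization": {"decoder_type": "${export.decoder_type}", "algorithm": "fp8"},
        "export": {"decoder_type": "llama"},
    }
)
print(cfg.quantization.decoder_type)  # prints "llama"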
(additional quantization config, file name not shown in this view)

@@ -190,15 +190,15 @@ model:
   reduce_on_plateau: false

 quantization:
-  decoder_type: ${export.decoder_type} # gptnext, gpt2, llama
+  decoder_type: ${export.decoder_type} # gpt, llama
   algorithm: int4 # null, int8_sq, fp8, int4_awq, int4
   num_calib_size: 512 # number of samples used for calibration
   awq_block_size: 128 # block size for scaling factors (only used in AWQ algorithms)
   sq_alpha: 1.0 # alpha parameter (only used in SmoothQuant algorithms)
   enable_kv_cache: false # Enable FP8 KV cache quantization. Set to null for automatic selection.

 export:
-  decoder_type: llama # gptnext, gpt2, llama
+  decoder_type: llama # gpt, llama
   inference_tensor_parallel: 1 # Default using 1 TP for inference
   inference_pipeline_parallel: 1 # Default using 1 PP for inference
   dtype: ${trainer.precision} # Default precision data type
nemo/collections/llm/quantization/quantizer.py (4 changes: 2 additions & 2 deletions)

@@ -198,7 +198,7 @@ def quantize(self, model: MegatronParallel, forward_loop=None):
         # TODO: Investigate why enabling FP8 kv cache will cause accuracy regressions for Nemotron.
         enable_quant_kv_cache = self.quantization_config.enable_kv_cache
         if enable_quant_kv_cache is None:
-            enable_quant_kv_cache = "int8" not in algorithm and decoder_type != "gptnext"
+            enable_quant_kv_cache = "int8" not in algorithm and decoder_type != "gpt"
         logging.info(f'{"Enabled" if enable_quant_kv_cache else "Disabled"} KV cache quantization')
         quant_cfg["quant_cfg"]["*output_quantizer"] = {
             "num_bits": 8 if algorithm == "int8_sq" else (4, 3),
@@ -212,7 +212,7 @@ def quantize(self, model: MegatronParallel, forward_loop=None):

         unwrapped_model = mtq.quantize(unwrapped_model, quant_cfg, forward_loop)

-        if decoder_type == "gptnext":
+        if decoder_type == "gpt":
             # We found squared_relu may have an under-calibration problem.
             # Clamp the scaling_factor with a min threshold to avoid under-calibration.
             match algorithm:
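The renamed decoder type feeds the KV-cache default shown above. As a standalone restatement of that rule (not the actual NeMo helper, just the same boolean expression): FP8 KV-cache quantization is auto-enabled only when the algorithm is not int8-based and the decoder is not the "gpt" (formerly "gptnext") family.

def default_kv_cache_quantization(algorithm: str, decoder_type: str) -> bool:
    # Mirrors the expression in quantize() above: int8 algorithms and the "gpt"
    # decoder family keep KV-cache quantization off by default.
    return "int8" not in algorithm and decoder_type != "gpt"

assert default_kv_cache_quantization("fp8", "llama") is True
assert default_kv_cache_quantization("int8_sq", "llama") is False
assert default_kv_cache_quantization("fp8", "gpt") is False  # Nemotron et al. now map to "gpt"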
nemo/collections/llm/quantization/utils.py (6 changes: 3 additions & 3 deletions)

@@ -33,10 +33,10 @@ def get_modelopt_decoder_type(model: llm.GPTModel) -> str:
         (llm.LlamaModel, "llama"),
         (llm.MistralModel, "llama"),
         (llm.MixtralModel, "llama"),
-        (llm.NemotronModel, "gptnext"),
+        (llm.NemotronModel, "gpt"),
         (llm.Qwen2Model, "qwen"),
-        (llm.StarcoderModel, "gptnext"),
-        (llm.Starcoder2Model, "gptnext"),
+        (llm.StarcoderModel, "gpt"),
+        (llm.Starcoder2Model, "gpt"),
         (llm.Phi3Model, "phi3"),
     ]
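For context, a table like the one above is typically consumed by checking the model instance against each class in order and returning the first match. The sketch below uses hypothetical stand-in classes; the real get_modelopt_decoder_type operates on NeMo's llm.* model classes and may differ in detail.

# Illustrative only: stand-in classes instead of NeMo's llm.* models.
class LlamaModel: ...
class NemotronModel: ...

_DECODER_TYPE_TABLE = [
    (LlamaModel, "llama"),
    (NemotronModel, "gpt"),  # previously "gptnext"
]

def decoder_type_for(model) -> str:
    # First isinstance match wins; unknown models raise.
    for model_cls, decoder_type in _DECODER_TYPE_TABLE:
        if isinstance(model, model_cls):
            return decoder_type
    raise ValueError(f"Unsupported model: {type(model).__name__}")

print(decoder_type_for(NemotronModel()))  # -> "gpt"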
nemo/export/quantize/quantizer.py (4 changes: 2 additions & 2 deletions)

@@ -120,7 +120,7 @@ def __init__(self, quantization_config: Optional[DictConfig], export_config: Opt
         enable_quant_kv_cache = quantization_config.get("enable_kv_cache", None)
         if enable_quant_kv_cache is None:
             enable_quant_kv_cache = (
-                "int8" not in quantization_config.algorithm and quantization_config.decoder_type != "gptnext"
+                "int8" not in quantization_config.algorithm and quantization_config.decoder_type != "gpt"
             )
         logging.info(f'{"Enabled" if enable_quant_kv_cache else "Disabled"} KV cache quantization')
         quant_cfg["quant_cfg"]["*output_quantizer"] = {
@@ -196,7 +196,7 @@ def quantize(self, model: MegatronGPTModel, forward_loop: Callable[[MegatronGPTM

         model = mtq.quantize(model, self.quant_cfg, forward_loop)

-        if self.quantization_config.decoder_type == "gptnext":
+        if self.quantization_config.decoder_type == "gpt":
             # We found squared_relu may have an under-calibration problem.
             # Clamp the scaling_factor with a min threshold to avoid under-calibration.
             maxbound = 0
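The "*output_quantizer" entry built in both quantizers also encodes the precision used for KV-cache quantization: 8-bit for int8_sq, otherwise FP8, which ModelOpt expresses as the (4, 3) exponent/mantissa split of E4M3. A minimal restatement of just that selection; other fields of the entry are not shown in this diff and are omitted here:

def output_quantizer_num_bits(algorithm: str):
    # int8_sq keeps an 8-bit output quantizer; every other algorithm uses FP8 (E4M3),
    # written as the tuple (exponent_bits, mantissa_bits) = (4, 3).
    return 8 if algorithm == "int8_sq" else (4, 3)

print(output_quantizer_num_bits("int8_sq"))  # 8
print(output_quantizer_num_bits("fp8"))      # (4, 3)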