Dockerfile.ci (2 changes: 1 addition & 1 deletion)

@@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T

 # Install NeMo requirements
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
-ARG MODELOPT_VERSION=0.19.0
+ARG MODELOPT_VERSION=0.21.0
 ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa

 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml (4 changes: 2 additions & 2 deletions)

@@ -32,7 +32,7 @@ model:
   activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'

 quantization:
-  decoder_type: ${export.decoder_type} # gptnext, gpt2, llama
+  decoder_type: ${export.decoder_type} # gpt, llama
   algorithm: fp8 # null, int8_sq, fp8, int4_awq
   calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset
   num_calib_size: 512 # number of samples used for calibration
@@ -41,7 +41,7 @@ quantization:
   enable_kv_cache: null # Enable FP8 KV cache quantization. Set to null for automatic selection.

 export:
-  decoder_type: llama # gptnext, gpt2, llama
+  decoder_type: llama # gpt, llama
   inference_tensor_parallel: 1 # Default using 1 TP for inference
   inference_pipeline_parallel: 1 # Default using 1 PP for inference
   dtype: 16 # Default precision data type for non-quantized layers: 16 or bf16
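Note on the config above: `${export.decoder_type}` is an OmegaConf interpolation, so the quantization section always follows whatever decoder type the export section sets. A minimal sketch of that resolution, assuming only that the config is loaded with OmegaConf (as NeMo's Hydra-based scripts do); the reduced config below is illustrative, not the full file:

from omegaconf import OmegaConf

# Reduced config mirroring the YAML above; quantization.decoder_type is an
# interpolation that resolves to export.decoder_type on access.
cfg = OmegaConf.create(
    {
        "quantization": {"decoder_type": "${export.decoder_type}", "algorithm": "fp8"},
        "export": {"decoder_type": "llama"},
    }
)
print(cfg.quantization.decoder_type)  # prints "llama"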
(additional quantization config, file name not shown in this view)

@@ -190,15 +190,15 @@ model:
   reduce_on_plateau: false

 quantization:
-  decoder_type: ${export.decoder_type} # gptnext, gpt2, llama
+  decoder_type: ${export.decoder_type} # gpt, llama
   algorithm: int4 # null, int8_sq, fp8, int4_awq, int4
   num_calib_size: 512 # number of samples used for calibration
   awq_block_size: 128 # block size for scaling factors (only used in AWQ algorithms)
   sq_alpha: 1.0 # alpha parameter (only used in SmoothQuant algorithms)
   enable_kv_cache: false # Enable FP8 KV cache quantization. Set to null for automatic selection.

 export:
-  decoder_type: llama # gptnext, gpt2, llama
+  decoder_type: llama # gpt, llama
   inference_tensor_parallel: 1 # Default using 1 TP for inference
   inference_pipeline_parallel: 1 # Default using 1 PP for inference
   dtype: ${trainer.precision} # Default precision data type
nemo/collections/llm/quantization/quantizer.py (4 changes: 2 additions & 2 deletions)

@@ -198,7 +198,7 @@ def quantize(self, model: MegatronParallel, forward_loop=None):
         # TODO: Investigate why enabling FP8 kv cache will cause accuracy regressions for Nemotron.
         enable_quant_kv_cache = self.quantization_config.enable_kv_cache
         if enable_quant_kv_cache is None:
-            enable_quant_kv_cache = "int8" not in algorithm and decoder_type != "gptnext"
+            enable_quant_kv_cache = "int8" not in algorithm and decoder_type != "gpt"
         logging.info(f'{"Enabled" if enable_quant_kv_cache else "Disabled"} KV cache quantization')
         quant_cfg["quant_cfg"]["*output_quantizer"] = {
             "num_bits": 8 if algorithm == "int8_sq" else (4, 3),
@@ -212,7 +212,7 @@ def quantize(self, model: MegatronParallel, forward_loop=None):

         unwrapped_model = mtq.quantize(unwrapped_model, quant_cfg, forward_loop)

-        if decoder_type == "gptnext":
+        if decoder_type == "gpt":
             # We found squared_relu may have an under-calibration problem.
             # Clamp the scaling_factor with a min threshold to avoid under-calibration.
             match algorithm:
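The renamed decoder type feeds the KV-cache default shown above. As a standalone restatement of that rule (not the actual NeMo helper, just the same boolean expression): FP8 KV-cache quantization is auto-enabled only when the algorithm is not int8-based and the decoder is not the "gpt" (formerly "gptnext") family.

def default_kv_cache_quantization(algorithm: str, decoder_type: str) -> bool:
    # Mirrors the expression in quantize() above: int8 algorithms and the "gpt"
    # decoder family keep KV-cache quantization off by default.
    return "int8" not in algorithm and decoder_type != "gpt"

assert default_kv_cache_quantization("fp8", "llama") is True
assert default_kv_cache_quantization("int8_sq", "llama") is False
assert default_kv_cache_quantization("fp8", "gpt") is False  # Nemotron et al. now map to "gpt"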
nemo/collections/llm/quantization/utils.py (6 changes: 3 additions & 3 deletions)

@@ -33,10 +33,10 @@ def get_modelopt_decoder_type(model: llm.GPTModel) -> str:
         (llm.LlamaModel, "llama"),
         (llm.MistralModel, "llama"),
         (llm.MixtralModel, "llama"),
-        (llm.NemotronModel, "gptnext"),
+        (llm.NemotronModel, "gpt"),
         (llm.Qwen2Model, "qwen"),
-        (llm.StarcoderModel, "gptnext"),
-        (llm.Starcoder2Model, "gptnext"),
+        (llm.StarcoderModel, "gpt"),
+        (llm.Starcoder2Model, "gpt"),
         (llm.Phi3Model, "phi3"),
     ]
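For context, a table like the one above is typically consumed by checking the model instance against each class in order and returning the first match. The sketch below uses hypothetical stand-in classes; the real get_modelopt_decoder_type operates on NeMo's llm.* model classes and may differ in detail.

# Illustrative only: stand-in classes instead of NeMo's llm.* models.
class LlamaModel: ...
class NemotronModel: ...

_DECODER_TYPE_TABLE = [
    (LlamaModel, "llama"),
    (NemotronModel, "gpt"),  # previously "gptnext"
]

def decoder_type_for(model) -> str:
    # First isinstance match wins; unknown models raise.
    for model_cls, decoder_type in _DECODER_TYPE_TABLE:
        if isinstance(model, model_cls):
            return decoder_type
    raise ValueError(f"Unsupported model: {type(model).__name__}")

print(decoder_type_for(NemotronModel()))  # -> "gpt"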
nemo/export/quantize/quantizer.py (4 changes: 2 additions & 2 deletions)

@@ -120,7 +120,7 @@ def __init__(self, quantization_config: Optional[DictConfig], export_config: Opt
         enable_quant_kv_cache = quantization_config.get("enable_kv_cache", None)
         if enable_quant_kv_cache is None:
             enable_quant_kv_cache = (
-                "int8" not in quantization_config.algorithm and quantization_config.decoder_type != "gptnext"
+                "int8" not in quantization_config.algorithm and quantization_config.decoder_type != "gpt"
             )
         logging.info(f'{"Enabled" if enable_quant_kv_cache else "Disabled"} KV cache quantization')
         quant_cfg["quant_cfg"]["*output_quantizer"] = {
@@ -196,7 +196,7 @@ def quantize(self, model: MegatronGPTModel, forward_loop: Callable[[MegatronGPTM

         model = mtq.quantize(model, self.quant_cfg, forward_loop)

-        if self.quantization_config.decoder_type == "gptnext":
+        if self.quantization_config.decoder_type == "gpt":
             # We found squared_relu may have an under-calibration problem.
             # Clamp the scaling_factor with a min threshold to avoid under-calibration.
             maxbound = 0
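The "*output_quantizer" entry built in both quantizers also encodes the precision used for KV-cache quantization: 8-bit for int8_sq, otherwise FP8, which ModelOpt expresses as the (4, 3) exponent/mantissa split of E4M3. A minimal restatement of just that selection; other fields of the entry are not shown in this diff and are omitted here:

def output_quantizer_num_bits(algorithm: str):
    # int8_sq keeps an 8-bit output quantizer; every other algorithm uses FP8 (E4M3),
    # written as the tuple (exponent_bits, mantissa_bits) = (4, 3).
    return 8 if algorithm == "int8_sq" else (4, 3)

print(output_quantizer_num_bits("int8_sq"))  # 8
print(output_quantizer_num_bits("fp8"))      # (4, 3)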