6 changes: 3 additions & 3 deletions nemo/collections/llm/quantization/utils.py
@@ -55,12 +55,12 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig:
get_gpt_layer_modelopt_spec,
)

- model_cfg.transformer_layer_spec = get_gpt_layer_modelopt_spec()
+ model_cfg.transformer_layer_spec = get_gpt_layer_modelopt_spec(num_experts=model_cfg.num_moe_experts)
if model_cfg.sequence_parallel:
logging.warning("Disabling sequence parallelism for quantization...")
model_cfg.sequence_parallel = False
- # Only custom ModelOpt spec is supported for Quantization: this custom spec is largely based on local Megatron-LM
- # layer definitions to avoid Transformer Engine implementations that are currently not supported.
+ # Only custom ModelOpt spec is supported for quantization: this custom spec is largely based on local
+ # Megatron-LM layer definitions to avoid Transformer Engine implementations that are currently not supported.
# This layer spec also requires RoPE fusion to be disabled for tensor view operations in attention
# layer implementation from megatron/core/transformer/dot_product_attention.py to be functional.
model_cfg.name = "modelopt"
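For reference, a minimal usage sketch of the helper patched above. It is not part of the PR; the exact set of `GPTConfig` constructor fields shown is an assumption, and only behavior visible in this diff (ModelOpt spec selection, sequence parallelism disabled) is asserted.

```python
# Illustrative sketch only (not part of this PR): prepare a GPT config for
# post-training quantization. The constructor fields below are assumptions;
# the real required fields of llm.GPTConfig may differ.
from nemo.collections import llm
from nemo.collections.llm.quantization.utils import quantizable_model_config

cfg = llm.GPTConfig(
    num_layers=2,
    hidden_size=256,
    num_attention_heads=4,
    num_moe_experts=8,  # forwarded to get_gpt_layer_modelopt_spec() after this PR
)
cfg = quantizable_model_config(cfg)

assert cfg.name == "modelopt"          # custom ModelOpt layer spec selected
assert cfg.sequence_parallel is False  # sequence parallelism disabled for quantization
```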
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+ from typing import Optional
+
try:
from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
@@ -37,14 +39,20 @@


# Use this spec for Model Optimizer PTQ and TensorRT-LLM export
- def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec:
+ def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec:
"""Mix the native spec with TENorm and TEDotProductAttention.

This is essentially the native local spec except for the layernorm implementation
is using TENorm from Transformer-Engine. This TENorm supports both FusedLayerNorm and RMSNorm and
prevents the apex dependency.

TEDotProductAttention is used to support sliding window attention.

+ Args:
+     num_experts (int): Number of experts. Defaults to None.
+
+ Returns:
+     ModuleSpec: Module specification with Megatron-Core modules.
"""
if not HAVE_MEGATRON_CORE:
raise IMPORT_ERROR
@@ -79,7 +87,7 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec:


# Helper function to get module spec for MLP/MoE
- def _get_mlp_module_spec(num_experts: int = None) -> ModuleSpec:
+ def _get_mlp_module_spec(num_experts: Optional[int] = None) -> ModuleSpec:
if num_experts is None:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
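For reference, a small sketch of how the widened `Optional[int]` signature might be called. The import path below is an assumption (the header identifying this file was not captured on this page), and megatron.core must be installed for the spec to build.

```python
# Sketch only: the module path is assumed, not confirmed by this diff.
from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec import (
    get_gpt_layer_modelopt_spec,
)

dense_spec = get_gpt_layer_modelopt_spec()             # num_experts=None -> dense MLP submodules
moe_spec = get_gpt_layer_modelopt_spec(num_experts=8)  # MoE submodules for an 8-expert model
print(type(dense_spec).__name__, type(moe_spec).__name__)  # ModuleSpec ModuleSpec
```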
7 changes: 2 additions & 5 deletions nemo/export/quantize/quantizer.py
@@ -164,10 +164,6 @@ def modify_model_config(model_cfg: DictConfig) -> DictConfig:
if model_cfg.get("sequence_parallel", False):
logging.warning("Disabling sequence parallelism for quantization...")
model_cfg.sequence_parallel = False
- # Only custom ModelOpt spec is supported for Quantization: this custom spec is largely based on local Megatron-LM
- # layer definitions to avoid Transformer Engine implementations that are currently not supported.
- # This layer spec also requires RoPE fusion to be disabled for tensor view operations in attention
- # layer implementation from megatron/core/transformer/dot_product_attention.py to be functional.
model_cfg.name = "modelopt"
model_cfg.apply_rope_fusion = False

@@ -248,7 +244,8 @@ def export(self, model: MegatronGPTModel):
)
dist.barrier() # Wait until all ranks complete export_model_config step
logging.info(
f"Exporting quantized weights, model artifacts, and tokenizer config to {self.export_config.save_path}..."
"Exporting quantized weights, model artifacts,"
f" and tokenizer config to {self.export_config.save_path}..."
)
if dist.get_rank() == 0:
save_artifacts(model, export_dir)
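For reference, a generic sketch of the barrier-then-rank-0 save pattern used in `export` above. The helper name is illustrative, not the quantizer's API; it assumes torch.distributed's default process group is already initialized, with `save_fn` standing in for helpers such as `save_artifacts`.

```python
import torch.distributed as dist


def save_on_rank_zero(save_fn, *args, **kwargs):
    dist.barrier()            # wait until every rank has finished the previous step
    if dist.get_rank() == 0:  # only one rank writes artifacts to disk
        save_fn(*args, **kwargs)
    dist.barrier()            # keep ranks aligned before moving on
```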