From e6b0db18fe832eb2ecab65b10a7200f99f6fd2f3 Mon Sep 17 00:00:00 2001
From: Jan Lasek
Date: Fri, 16 Feb 2024 18:15:50 +0100
Subject: [PATCH 01/30] AMMO integration with Llama2 PTQ example and tests

Signed-off-by: Jan Lasek
---
 Dockerfile                                    |   2 +
 .../conf/megatron_llama_quantization.yaml     |  35 ++++
 .../megatron_llama_quantization.py            |  71 ++++++++
 .../language_modeling/megatron/model_specs.py |  41 +++++
 .../language_modeling/megatron_gpt_model.py   |   2 +
 nemo/export/__init__.py                       |   0
 nemo/export/quantize/__init__.py              |   1 +
 nemo/export/quantize/quantizer.py             | 158 ++++++++++++++++++
 nemo/export/quantize/utils_wip.py             |  62 +++++++
 tests/setup/__main__.py                       |  31 ++++
 tests/setup/data/create_sample_jsonl.py       |  44 +++++
 tests/setup/models/create_hf_model.py         |  88 ++++++++++
 12 files changed, 535 insertions(+)
 create mode 100644 examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml
 create mode 100644 examples/nlp/language_modeling/megatron_llama_quantization.py
 create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/model_specs.py
 create mode 100644 nemo/export/__init__.py
 create mode 100644 nemo/export/quantize/__init__.py
 create mode 100644 nemo/export/quantize/quantizer.py
 create mode 100644 nemo/export/quantize/utils_wip.py
 create mode 100644 tests/setup/__main__.py
 create mode 100644 tests/setup/data/create_sample_jsonl.py
 create mode 100644 tests/setup/models/create_hf_model.py

diff --git a/Dockerfile b/Dockerfile
index 90c84ea07627..da3f1100be2f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -132,6 +132,8 @@ RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-chec
 RUN pip install flash-attn
 # install numba for latest containers
 RUN pip install numba>=0.57.1
+# install AMMO # TODO: add to requirements
+RUN pip install nvidia-ammo==0.7.2 --extra-index-url https://pypi.nvidia.com --no-cache-dir
 
 # copy nemo source into a scratch image
 FROM scratch as nemo-src
diff --git a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml
new file mode 100644
index 000000000000..5603aa9c92ba
--- /dev/null
+++ b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml
@@ -0,0 +1,35 @@
+inference:
+  greedy: false # Whether or not to use sampling; use greedy decoding otherwise
+  top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k filtering.
+  top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+  temperature: 1.0 # sampling temperature
+  add_BOS: true # add the bos token at the beginning of the prompt
+  tokens_to_generate: 30 # The maximum number of tokens to generate.
+  all_probs: false # whether to return the log prob for all the tokens in the vocab
+  repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
+  min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
+  compute_logprob: false # a flag used to compute logprob of all the input text, a very special case of running inference, default False
+  batch_size: 4 # batch size for inference
+  max_context_length: 512 # max length of the context; the input sequence will be truncated if it is longer than this
+
+trainer:
+  devices: 1
+  num_nodes: 1
+  accelerator: gpu
+  logger: false # logger provided by exp_manager
+  precision: bf16 # 16, 32, or bf16
+  enable_checkpointing: false
+
+quantization:
+  quantize_bmm1: false
+  algorithm: fp8 # int8_sq, fp8, int8, int4_awq
+  calib_dataset: cnn_dailymail # pileval, wikitext, cnn_dailymail
+  num_calib_size: 128 # number of samples used for calibration
+
+tensor_model_parallel_size: 1
+pipeline_model_parallel_size: 1
+decoder_type: llama # gptnext, llama
+model_file: llama2-7b-fp16.nemo # nemo file path
+model_save_path: llama2-7b-fp16.qnemo # Path where the quantized model will be saved
+inference_tensor_parallel: 1 # Default using 1 TP for inference
+dtype: 16 # Default precision data type
diff --git a/examples/nlp/language_modeling/megatron_llama_quantization.py b/examples/nlp/language_modeling/megatron_llama_quantization.py
new file mode 100644
index 000000000000..fcb8fa0e86d5
--- /dev/null
+++ b/examples/nlp/language_modeling/megatron_llama_quantization.py
@@ -0,0 +1,71 @@
+import torch
+import torch.multiprocessing as mp
+from datasets import load_dataset
+
+from nemo.core.config import hydra_runner
+from nemo.export.quantize import Quantizer
+
+mp.set_start_method("spawn", force=True)
+
+"""
+Nemo quantization example script.
+
+Please consult the nemo.export.quantize.Quantizer class
+and the examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml config for available quantization
+methods and supported models, as well as how to set up data and inference for calibration (with defaults recommended).
+
+Example usage:
+```
+python examples/nlp/language_modeling/megatron_llama_quantization.py \
+    model_file=llama2-7b-fp16.nemo \
+    decoder_type=llama \
+    quantization.algorithm=int8_sq \
+    model_save_path=llama2-7b-fp16.qnemo
+```
+"""
+
+
+def get_calib_dataloader(data="cnn_dailymail", batch_size=4, calib_size=512, max_sequence_length=512):
+    if data == "pileval":
+        dataset = load_dataset("json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train")
+        text_column = "text"
+    elif data == "wikitext":
+        dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
+        text_column = "text"
+    elif data == "cnn_dailymail":
+        dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
+        text_column = "article"
+    else:
+        # Assume a local JSON dataset with a column named "text"
+        dataset = load_dataset("json", data_files=data, split="train")
+        text_column = "text"
+    calib_size = max(min(len(dataset), calib_size), batch_size)
+    for i in range(calib_size // batch_size):
+        batch = dataset[i * batch_size : (i + 1) * batch_size][text_column]
+        for j in range(len(batch)):
+            batch[j] = batch[j][:max_sequence_length]
+        yield batch
+
+
+@hydra_runner(config_path="conf", config_name="megatron_llama_quantization")
+def main(cfg) -> None:
+    if not torch.cuda.is_available():
+        raise EnvironmentError("GPU is required for the inference.")
+
+    quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.trainer)
+
+    dataloader = get_calib_dataloader(
+        cfg.quantization.calib_dataset,
+        cfg.inference.batch_size,
+        cfg.quantization.num_calib_size,
+        cfg.inference.max_context_length,
+    )
+    dataloader = [data for data in dataloader]
+
+    model = quantizer.quantize(cfg.model_file, dataloader, cfg.tensor_model_parallel_size)
+
+    quantizer.export(model, cfg.model_save_path, cfg.decoder_type, cfg.dtype, cfg.inference_tensor_parallel)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py
new file mode 100644
index 000000000000..8b9b6868b32c
--- /dev/null
+++ b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py
@@ -0,0 +1,41 @@
+# TODO: This will be a part of MCore
+from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
+from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
+from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
+from megatron.core.transformer.custom_layers.transformer_engine import TENorm
+from megatron.core.transformer.dot_product_attention import DotProductAttention
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.mlp import MLP, MLPSubmodules
+from megatron.core.transformer.spec_utils import ModuleSpec
+from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
+
+
+def get_gpt_layer_ammo_spec() -> ModuleSpec:
+    return ModuleSpec(
+        module=TransformerLayer,
+        submodules=TransformerLayerSubmodules(
+            input_layernorm=TENorm,
+            self_attention=ModuleSpec(
+                module=SelfAttention,
+                params={"attn_mask_type": AttnMaskType.causal},
+                submodules=SelfAttentionSubmodules(
+                    linear_qkv=ColumnParallelLinear,
+                    core_attention=DotProductAttention,
+                    linear_proj=RowParallelLinear,
+                ),
+            ),
+            self_attn_bda=get_bias_dropout_add,
+            pre_mlp_layernorm=TENorm,
+            mlp=ModuleSpec(
+                module=MLP,
+                submodules=MLPSubmodules(
+                    linear_fc1=ColumnParallelLinear,
+                    linear_fc2=RowParallelLinear,
+                ),
+            ),
+            mlp_bda=get_bias_dropout_add,
+            sharded_state_dict_keys_map={
+                'input_layernorm.': 'self_attention.linear_qkv.layer_norm_',
+                'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_',
+            },
+        ),
+    )
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 5cd4ccf380eb..8c213b0d04f6 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -43,6 +43,7 @@ from nemo.collections.nlp.models.language_modeling.megatron.gpt_full_te_layer_autocast_spec import (
     get_gpt_full_te_layer_autocast_spec,
 )
+from nemo.collections.nlp.models.language_modeling.megatron.model_specs import get_gpt_layer_ammo_spec
 from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel
 from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel
 from nemo.collections.nlp.modules.common.megatron.build_model import build_model
@@ -139,6 +140,7 @@ def get_specs(spec_name, num_experts=None):
         "": get_gpt_layer_with_transformer_engine_spec(num_experts),
         "megatron_falcon_gpt": get_falcon_layer_spec(),
         "megatron_gpt_full_te_layer_autocast": get_gpt_full_te_layer_autocast_spec(),
+        "ammo": get_gpt_layer_ammo_spec(),
     }
     if spec_name not in name_spec_dict:
         raise ValueError(f"Spec name '{spec_name}' is not recognized.")
diff --git a/nemo/export/__init__.py b/nemo/export/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/nemo/export/quantize/__init__.py b/nemo/export/quantize/__init__.py
new file mode 100644
index 000000000000..f89c700da6fe
--- /dev/null
+++ b/nemo/export/quantize/__init__.py
@@ -0,0 +1 @@
+from .quantizer import Quantizer
diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py
new file mode 100644
index 000000000000..e6d0b997d382
--- /dev/null
+++ b/nemo/export/quantize/quantizer.py
@@ -0,0 +1,158 @@
+import copy
+import os
+import tarfile
+from typing import Optional
+
+import ammo.torch.quantization as atq
+import torch.distributed as dist
+from ammo.torch.export import export_model_config
+from ammo.torch.utils import print_rank_0
+from megatron.core import parallel_state
+from omegaconf import OmegaConf
+from omegaconf.omegaconf import DictConfig, open_dict
+from pytorch_lightning.trainer.trainer import Trainer
+
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector
+from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision
+from nemo.utils import logging
+from nemo.utils.get_rank import is_global_rank_zero
+
+from .utils_wip import copy_artifacts, temporary_directory  # TODO: Find a good place for these utils
+
+QUANT_CFG_CHOICES = {
+    "int8": atq.INT8_DEFAULT_CFG,
+    "int8_sq": atq.INT8_SMOOTHQUANT_CFG,
+    "fp8": atq.FP8_DEFAULT_CFG,
+    "int4_awq": atq.INT4_AWQ_CFG,
+    "w4a8_awq": atq.W4A8_AWQ_BETA_CFG,
+}
+
+
+class Quantizer:
+
+    """
+    Post-training quantization of Nemo checkpoints.
+
+    PTQ converts selected model layers to low-precision format (e.g., INT4, FP8) for efficient serving.
+    The process consists of several steps:
+
+    1. Loading a Nemo model from disk using an appropriate parallelism strategy
+    2. Calibrating the model to obtain appropriate algorithm-specific scaling factors
+    3. Producing a .qnemo tarball with the model config (JSON), quantized weights (safetensors)
+       and tokenizer config (YAML).
+
+    The produced .qnemo file is intended to be consumed by the TensorRT-LLM toolbox for inference.
+    This can be achieved using Nemo inference containers.
+
+    The currently supported and tested model family is Llama2. The model type needs to be specified
+    with the decoder_type parameter on exporting (see below). Quantizing other model families is
+    experimental and might not be fully supported.
+
+    Available quantization methods are listed in the QUANT_CFG_CHOICES dictionary at the top of this file.
+    Please consult the AMMO documentation for details. You can also inspect the choices of quantization
+    algorithms and calibration data, as well as recommended settings, in
+    examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml.
+    """
+
+    def __init__(self, quantization_config: DictConfig, inference_config: DictConfig, trainer_config: DictConfig):
+        self.quantization_config = quantization_config
+        self.inference_config = inference_config
+        self.trainer_config = trainer_config
+        assert self.quantization_config.algorithm in QUANT_CFG_CHOICES
+        atq_config = QUANT_CFG_CHOICES[quantization_config.algorithm]
+        if quantization_config.algorithm != "fp8":
+            # disable quantization for the last output layer
+            atq_config = copy.deepcopy(atq_config)
+            atq_config["quant_cfg"]["*.output_layer.*"] = {"enable": False}
+        self.atq_config = atq_config
+
+    def _load_model(self, model_file, tensor_model_parallel_size: Optional[int] = None):
+        trainer = Trainer(strategy=NLPDDPStrategy(), **self.trainer_config)
+        connector = NLPSaveRestoreConnector()
+
+        if os.path.isdir(model_file):
+            connector.model_extracted_dir = model_file
+
+        model_cfg = self._restore_and_modify_config(model_file, trainer, connector, tensor_model_parallel_size)
+
+        model = MegatronGPTModel.restore_from(
+            restore_path=model_file,
+            trainer=trainer,
+            override_config_path=model_cfg,
+            save_restore_connector=connector,
+        )
+        model.freeze()
+
+        try:
+            model.model.module.language_model.encoder.activations_checkpoint_method = None
+        except AttributeError:
+            pass
+        print_rank_0(model)
+        self._check_ddp_initialized(model)
+        return model
+
+    def _check_ddp_initialized(self, model):
+        if parallel_state.is_unitialized():
+
+            def dummy():
+                return
+
+            if model.trainer.strategy.launcher is not None:
+                model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer)
+            model.trainer.strategy.setup_environment()
+
+    def _restore_and_modify_config(
+        self,
+        model_file: str,
+        trainer: Trainer,
+        connector: NLPSaveRestoreConnector,
+        tensor_model_parallel_size: Optional[int] = None,
+    ):
+        model_cfg = MegatronGPTModel.restore_from(
+            restore_path=model_file,
+            trainer=trainer,
+            save_restore_connector=connector,
+            return_config=True,
+        )
+        with open_dict(model_cfg):
+            model_cfg.activations_checkpoint_method = None
+            model_cfg.activations_checkpoint_granularity = None
+            if tensor_model_parallel_size is not None:
+                model_cfg.tensor_model_parallel_size = tensor_model_parallel_size
+            model_cfg.name = "ammo"  # Model needs to be loaded in the "ammo" layer spec
+
+        return model_cfg
+
+    def quantize(self, model_file: str, dataloader, tensor_model_parallel_size: Optional[int] = None):
+        model = self._load_model(model_file, tensor_model_parallel_size)
+        model.set_inference_config(OmegaConf.to_container(self.inference_config))
+
+        def forward_loop():
+            for i, batch in enumerate(dataloader):
+                print_rank_0(f"Calibrating batch {i}")
model.predict_step(batch, i) + + atq.quantize(model, self.atq_config, forward_loop) + return model + + def export(self, model, output_file: str, decoder_type: str, dtype: str, inference_tensor_parallel: int): + supported_dtype = [16, "16", "bf16"] # FIXME: Move that to top + assert dtype in supported_dtype, f"{dtype} not supported. Supported dtypes are {supported_dtype}" + torch_dtype = torch_dtype_from_precision(dtype) + + with temporary_directory() as tmp_dir: + export_model_config( + model=model, + decoder_type=decoder_type, + dtype=torch_dtype, + export_dir=tmp_dir, + inference_tensor_parallel=inference_tensor_parallel, + ) + dist.barrier() # Wait until all ranks complete export_model_config step + if is_global_rank_zero(): + logging.info(f"Exporting quantized weights, tokenizer config, and model artifacts to {output_file}...") + with tarfile.open(output_file, "w:gz") as tar: + config = copy_artifacts(model, tmp_dir) + OmegaConf.save(config.tokenizer, os.path.join(tmp_dir, "tokenizer_config.yaml")) + tar.add(tmp_dir, arcname="./") diff --git a/nemo/export/quantize/utils_wip.py b/nemo/export/quantize/utils_wip.py new file mode 100644 index 000000000000..58ef8f09769e --- /dev/null +++ b/nemo/export/quantize/utils_wip.py @@ -0,0 +1,62 @@ +import contextlib +import copy +import os +import shutil +import tarfile +import tempfile + +import torch +import torch.distributed as dist +from omegaconf import OmegaConf + +from nemo.utils.app_state import AppState +from nemo.utils.get_rank import get_rank, is_global_rank_zero + + +@contextlib.contextmanager +def temporary_directory(): + """Create a shared temporary directory across ranks in distributed setup. + + This function assumes that the distributed setup has been already + correctly initialized. It is intended to be used only in single-node + setup so that all ranks can access the directory created.""" + + if is_global_rank_zero(): + tmp_dir = [tempfile.TemporaryDirectory()] + else: + tmp_dir = [None] + torch.distributed.broadcast_object_list(tmp_dir) + print(f"[{get_rank()}] tmp_dir={tmp_dir}") # TODO: remove debug print + yield tmp_dir[0].name + # We use barrier below to make sure that rank zero won't exit + # and delete tmp_dir while other ranks may still use it + dist.barrier() + + +def copy_artifacts(model, output_dir: str): + """Copy all model artifacts to a given output directory and return modified config.""" + app_state = AppState() + model_file = app_state.model_restore_path + model_config = copy.deepcopy(model.cfg) + + # Setup model file handling context: directory or tarball + if os.path.isfile(model_file): + model_file_handler = tarfile.open + kwargs = {"name": model_file, "mode": "r:"} + elif os.path.isdir(model_file): + model_file_handler = contextlib.nullcontext + kwargs = {} + else: + raise FileNotFoundError(model_file) + + # Copy or extract artifacts depending on the context + with model_file_handler(**kwargs) as maybe_tar: + for arti_name, arti_item in model.artifacts.items(): + _, arti_file = arti_item.path.split("nemo:") + if maybe_tar is not None: + maybe_tar.extract(f"./{arti_file}", path=output_dir) + else: + shutil.copy(os.path.join(model_file, arti_file), output_dir) + # Update artifact path to basename + OmegaConf.update(model_config, arti_name, os.path.basename(arti_file)) + return model_config diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py new file mode 100644 index 000000000000..5750122eb1be --- /dev/null +++ b/tests/setup/__main__.py @@ -0,0 +1,31 @@ +import argparse +import os + +from 
.data.create_sample_jsonl import create_sample_jsonl +from .models.create_hf_model import create_hf_model + +print("Setup test data and models...") + +parser = argparse.ArgumentParser("Setup test data and models.") +parser.add_argument("--data_dir", required=True, help="Root save directory for data") +parser.add_argument("--model_dir", required=True, help="Root save directory for models") +parser.add_argument("--overwrite", action="store_true", help="Overwrite existing files and directories") +args = parser.parse_args() + +print(f"Arguments are: {vars(args)}") + +os.makedirs(args.data_dir, exist_ok=True) +os.makedirs(args.model_dir, exist_ok=True) + +create_sample_jsonl( + os.path.join(args.data_dir, "test_quantization", "test.json"), + args.overwrite, +) + +create_hf_model( + "meta-llama/Llama-2-7b-hf", + os.path.join(args.model_dir, "tiny_llama2_hf"), + {"hidden_size": 128, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}, + args.overwrite, +) +print("Setup done.") diff --git a/tests/setup/data/create_sample_jsonl.py b/tests/setup/data/create_sample_jsonl.py new file mode 100644 index 000000000000..ee9bd3b48f7e --- /dev/null +++ b/tests/setup/data/create_sample_jsonl.py @@ -0,0 +1,44 @@ +import argparse +import json +import os + +""" +Create sample JSONL file for functional testing. Each line contains a dictionary +with a single element "text" for storing data. +""" + + +def create_sample_jsonl(output_file: str, overwrite: bool = False): + """Create sample JSONL.""" + if os.path.isfile(output_file) and not overwrite: + print(f"File {output_file} exists and overwrite flag is not set so exiting.") + return + + texts = [ + "Sample data for functional tests", + "Once upon a time, in the middle of a dense forest, there was a small house, where lived a pretty little girl " + "named Little Red Riding Hood.", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore " + "magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea " + "commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat " + "nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit " + "anim id est laborum...", + "Next please!", + "¡H E L L O W O R L D!", + "Korzystając z okazji chciałbym pozdrowić całą moją rodzinę i przyjaciół", + ] + print(f"Writing {len(texts)} line(s) to {output_file}...") + os.makedirs(os.path.dirname(output_file), exist_ok=True) + with open(output_file, mode="w", encoding="utf-8") as f: + for text in texts: + json.dump({"text": text}, f) + f.write("\n") + print("OK.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Create sample JSONL file.") + parser.add_argument("--output_file", help="Output file name") + parser.add_argument("--overwrite", action="store_true", help="Overwrite file if it exists") + args = parser.parse_args() + create_sample_jsonl(args.output_file) diff --git a/tests/setup/models/create_hf_model.py b/tests/setup/models/create_hf_model.py new file mode 100644 index 000000000000..5d40d9742628 --- /dev/null +++ b/tests/setup/models/create_hf_model.py @@ -0,0 +1,88 @@ +import argparse +import json +import os + +from typing import Any, Dict, Optional + +import transformers + +""" +Create a randomly initialized HuggingFace model for testing purposes. 
+ +Model can be specified by name or path for creating its config and tokenizer using +HuggingFace transformers AutoConfig and AutoTokenizer functions. + +Parameter config_updates can be used to override specific model config fields to make +it smaller, for example, by changing number of layers or hidden layers dimensionality, +making it adequate for testing purposes. This parameter should be specified as +a dictionary that can be parsed using json.loads method. + +Example usage for Llama2 model (requires HF login): +``` +python tests/setup/models/create_tiny_hf_model.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --output_dir tiny_llama2_hf \ + --config_updates '{"hidden_size": 128, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}' +``` +""" + + +def get_hf_model_class(hf_config): + """Get HuggingFace model class from config.""" + if len(hf_config.architectures) > 1: + print(f"More than one model architecture available, choosing 1st: {hf_config.architectures}") + model_name = hf_config.architectures[0] + model_class = getattr(transformers, model_name) + return model_class + + +def create_hf_model( + model_name_or_path: str, output_dir: str, config_updates: Optional[Dict[str, Any]] = None, overwrite: bool = False +): + """Create HuggingFace model with optional config updates.""" + if os.path.isdir(output_dir) and not overwrite: + print(f"Output directory {output_dir} exists and overwrite flag is not set so exiting.") + return + + hf_config = transformers.AutoConfig.from_pretrained(model_name_or_path) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path) + model_class = get_hf_model_class(hf_config) + + if config_updates is not None: + hf_config.update(config_updates) + print(hf_config) + + model = model_class(hf_config) + print(model) + + os.makedirs(output_dir, exist_ok=True) + print(f"Saving model to {output_dir}...") + tokenizer.save_pretrained(output_dir) + model.save_pretrained(output_dir) + print("OK.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Create a HuggingFace model (random initialization) for testing purposes.") + parser.add_argument( + "--model_name_or_path", + required=True, + help="Model name or local path with model config and tokenizer", + ) + parser.add_argument( + "--output_dir", + required=True, + help="Output directory", + ) + parser.add_argument( + "--config_updates", + type=json.loads, + help="Parameter updates in JSON format to overwrite for model config", + ) + parser.add_argument( + "--overwrite", + action="store_true", + help="Overwrite file if it exists", + ) + args = parser.parse_args() + create_hf_model(args.model_name_or_path, args.output_dir, args.config_updates) From 41b3f6d24870adfb638c787bcda2658ce11f23fb Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Fri, 16 Feb 2024 19:09:58 +0100 Subject: [PATCH 02/30] Jenkins megatron_llama_quantization.py test setup Signed-off-by: Jan Lasek --- Jenkinsfile | 37 ++++++++++++++++++++++++++++++++++++- tests/setup/__main__.py | 18 ++++++++---------- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 0625f469ce11..8ab088482491 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -96,6 +96,13 @@ pipeline { } } + // TODO: AMMO installation - move to requirements + stage('AMMO installation') { + steps { + sh 'pip install nvidia-ammo==0.7.2 --extra-index-url https://pypi.nvidia.com --no-cache-dir' + } + } + stage('PyTorch Lightning version') { steps { sh 'python -c "import pytorch_lightning; 
print(pytorch_lightning.__version__)"' @@ -390,6 +397,12 @@ pipeline { } } + stage('Setup test data and models') { + steps { + sh 'python -m tests.setup --save_dir /home/TestData/nlp' + } + } + // TODO: this requires TE >= v0.11 which is not available in 23.06. // please uncomment this test once mcore CI is ready. stage('L2: Community LLM Checkpoints tests') { @@ -407,7 +420,6 @@ pipeline { --in-file=/home/TestData/nlp/megatron_llama/llama-ci-hf \ --out-file=/home/TestData/nlp/megatron_llama/ci.nemo \ --precision=16' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' } } stage('StarCoder') { @@ -439,6 +451,29 @@ pipeline { } } + stage('L2: Nemo PTQ') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('Llama') { + steps { + sh 'CUDA_VISIBLE_DEVICES=0 python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=int8_sq \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + model_save_path=/home/TestData/nlp/megatron_llama/ci.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' + } + } + } + } stage('L2: ASR dev run') { when { anyOf { diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py index 5750122eb1be..707579e24350 100644 --- a/tests/setup/__main__.py +++ b/tests/setup/__main__.py @@ -7,25 +7,23 @@ print("Setup test data and models...") parser = argparse.ArgumentParser("Setup test data and models.") -parser.add_argument("--data_dir", required=True, help="Root save directory for data") -parser.add_argument("--model_dir", required=True, help="Root save directory for models") +parser.add_argument("--save_dir", required=True, help="Root save directory for artifacts") parser.add_argument("--overwrite", action="store_true", help="Overwrite existing files and directories") args = parser.parse_args() print(f"Arguments are: {vars(args)}") -os.makedirs(args.data_dir, exist_ok=True) -os.makedirs(args.model_dir, exist_ok=True) +os.makedirs(args.save_dir, exist_ok=True) create_sample_jsonl( - os.path.join(args.data_dir, "test_quantization", "test.json"), - args.overwrite, + output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), + overwrite=args.overwrite, ) create_hf_model( - "meta-llama/Llama-2-7b-hf", - os.path.join(args.model_dir, "tiny_llama2_hf"), - {"hidden_size": 128, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}, - args.overwrite, + model_name_or_path="/home/TestData/nlp/megatron_llama/llama-ci-hf", # FIXME: change to "meta-llama/Llama-2-7b-hf" + output_dir=os.path.join(args.save_dir, "tiny_llama2_hf"), + config_updates={"hidden_size": 128, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}, + overwrite=args.overwrite, ) print("Setup done.") From 71d952996812512467da42073409efa86e33adfa Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 Feb 2024 18:35:51 +0000 Subject: [PATCH 03/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language_modeling/megatron/model_specs.py | 9 ++------- .../language_modeling/megatron_gpt_model.py | 1 + nemo/export/quantize/quantizer.py | 10 ++-------- tests/setup/__main__.py | 3 +-- tests/setup/models/create_hf_model.py | 16 ++++------------ 5 files changed, 10 insertions(+), 29 deletions(-) diff 
--git a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py index 8b9b6868b32c..9f0b3ac5ca74 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py @@ -19,18 +19,13 @@ def get_gpt_layer_ammo_spec() -> ModuleSpec: module=SelfAttention, params={"attn_mask_type": AttnMaskType.causal}, submodules=SelfAttentionSubmodules( - linear_qkv=ColumnParallelLinear, - core_attention=DotProductAttention, - linear_proj=RowParallelLinear, + linear_qkv=ColumnParallelLinear, core_attention=DotProductAttention, linear_proj=RowParallelLinear, ), ), self_attn_bda=get_bias_dropout_add, pre_mlp_layernorm=TENorm, mlp=ModuleSpec( - module=MLP, - submodules=MLPSubmodules( - linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear, - ), + module=MLP, submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,), ), mlp_bda=get_bias_dropout_add, sharded_state_dict_keys_map={ diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8c213b0d04f6..d5024495889e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -45,6 +45,7 @@ ) from nemo.collections.nlp.models.language_modeling.megatron.model_specs import get_gpt_layer_ammo_spec from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel +from nemo.collections.nlp.models.language_modeling.megatron.model_specs import get_gpt_layer_ammo_spec from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel from nemo.collections.nlp.modules.common.megatron.build_model import build_model from nemo.collections.nlp.modules.common.megatron.module import Float16Module diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index e6d0b997d382..cfc89db5f05c 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -77,10 +77,7 @@ def _load_model(self, model_file, tensor_model_parallel_size: Optional[int] = No model_cfg = self._restore_and_modify_config(model_file, trainer, connector, tensor_model_parallel_size) model = MegatronGPTModel.restore_from( - restore_path=model_file, - trainer=trainer, - override_config_path=model_cfg, - save_restore_connector=connector, + restore_path=model_file, trainer=trainer, override_config_path=model_cfg, save_restore_connector=connector, ) model.freeze() @@ -110,10 +107,7 @@ def _restore_and_modify_config( tensor_model_parallel_size: Optional[int] = None, ): model_cfg = MegatronGPTModel.restore_from( - restore_path=model_file, - trainer=trainer, - save_restore_connector=connector, - return_config=True, + restore_path=model_file, trainer=trainer, save_restore_connector=connector, return_config=True, ) with open_dict(model_cfg): model_cfg.activations_checkpoint_method = None diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py index 707579e24350..8773a8b7fa55 100644 --- a/tests/setup/__main__.py +++ b/tests/setup/__main__.py @@ -16,8 +16,7 @@ os.makedirs(args.save_dir, exist_ok=True) create_sample_jsonl( - output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), - overwrite=args.overwrite, + output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), 
overwrite=args.overwrite, ) create_hf_model( diff --git a/tests/setup/models/create_hf_model.py b/tests/setup/models/create_hf_model.py index 5d40d9742628..dd5d98251e64 100644 --- a/tests/setup/models/create_hf_model.py +++ b/tests/setup/models/create_hf_model.py @@ -65,24 +65,16 @@ def create_hf_model( if __name__ == "__main__": parser = argparse.ArgumentParser("Create a HuggingFace model (random initialization) for testing purposes.") parser.add_argument( - "--model_name_or_path", - required=True, - help="Model name or local path with model config and tokenizer", + "--model_name_or_path", required=True, help="Model name or local path with model config and tokenizer", ) parser.add_argument( - "--output_dir", - required=True, - help="Output directory", + "--output_dir", required=True, help="Output directory", ) parser.add_argument( - "--config_updates", - type=json.loads, - help="Parameter updates in JSON format to overwrite for model config", + "--config_updates", type=json.loads, help="Parameter updates in JSON format to overwrite for model config", ) parser.add_argument( - "--overwrite", - action="store_true", - help="Overwrite file if it exists", + "--overwrite", action="store_true", help="Overwrite file if it exists", ) args = parser.parse_args() create_hf_model(args.model_name_or_path, args.output_dir, args.config_updates) From 9c2d7f4996c8098275566008a4d58610f8b06d56 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 19 Feb 2024 10:55:32 +0100 Subject: [PATCH 04/30] License headers Signed-off-by: Jan Lasek --- .../megatron_llama_quantization.py | 14 ++++++++++++++ .../language_modeling/megatron/model_specs.py | 14 ++++++++++++++ nemo/export/__init__.py | 13 +++++++++++++ nemo/export/quantize/__init__.py | 14 ++++++++++++++ nemo/export/quantize/quantizer.py | 14 ++++++++++++++ nemo/export/quantize/utils_wip.py | 14 ++++++++++++++ tests/setup/__main__.py | 14 ++++++++++++++ tests/setup/data/create_sample_jsonl.py | 14 ++++++++++++++ tests/setup/models/create_hf_model.py | 14 ++++++++++++++ 9 files changed, 125 insertions(+) diff --git a/examples/nlp/language_modeling/megatron_llama_quantization.py b/examples/nlp/language_modeling/megatron_llama_quantization.py index fcb8fa0e86d5..5565900f901b 100644 --- a/examples/nlp/language_modeling/megatron_llama_quantization.py +++ b/examples/nlp/language_modeling/megatron_llama_quantization.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch import torch.multiprocessing as mp from datasets import load_dataset diff --git a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py index 9f0b3ac5ca74..d7fb633b3eda 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # TODO: This will be a part of MCore from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear diff --git a/nemo/export/__init__.py b/nemo/export/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/nemo/export/__init__.py +++ b/nemo/export/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/export/quantize/__init__.py b/nemo/export/quantize/__init__.py index f89c700da6fe..87812e621bb6 100644 --- a/nemo/export/quantize/__init__.py +++ b/nemo/export/quantize/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .quantizer import Quantizer diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index cfc89db5f05c..37b635d30d2d 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import copy import os import tarfile diff --git a/nemo/export/quantize/utils_wip.py b/nemo/export/quantize/utils_wip.py index 58ef8f09769e..0406353401ff 100644 --- a/nemo/export/quantize/utils_wip.py +++ b/nemo/export/quantize/utils_wip.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import contextlib import copy import os diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py index 8773a8b7fa55..51cdab795a99 100644 --- a/tests/setup/__main__.py +++ b/tests/setup/__main__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import os diff --git a/tests/setup/data/create_sample_jsonl.py b/tests/setup/data/create_sample_jsonl.py index ee9bd3b48f7e..00f789548f81 100644 --- a/tests/setup/data/create_sample_jsonl.py +++ b/tests/setup/data/create_sample_jsonl.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import json import os diff --git a/tests/setup/models/create_hf_model.py b/tests/setup/models/create_hf_model.py index dd5d98251e64..9f57d5996dfc 100644 --- a/tests/setup/models/create_hf_model.py +++ b/tests/setup/models/create_hf_model.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import argparse import json import os From ae88e4744e43f0c7afc7d6bc8739a42a05a18209 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 19 Feb 2024 10:55:50 +0100 Subject: [PATCH 05/30] Add AMMO to requirements_nlp.txt with --extra-index-url for pip install Signed-off-by: Jan Lasek --- Dockerfile | 4 +--- Jenkinsfile | 7 ------- reinstall.sh | 2 +- requirements/requirements_nlp.txt | 1 + 4 files changed, 3 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index da3f1100be2f..de85b35bf253 100644 --- a/Dockerfile +++ b/Dockerfile @@ -126,14 +126,12 @@ RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_k2.sh); INSTALL WORKDIR /tmp/nemo ENV LHOTSE_REQUIRE_TORCHAUDIO=0 COPY requirements . -RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done +RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --extra-index-url https://pypi.nvidia.com --no-cache-dir -r $f; done # install flash attention RUN pip install flash-attn # install numba for latest containers RUN pip install numba>=0.57.1 -# install AMMO # TODO: add to requirements -RUN pip install nvidia-ammo==0.7.2 --extra-index-url https://pypi.nvidia.com --no-cache-dir # copy nemo source into a scratch image FROM scratch as nemo-src diff --git a/Jenkinsfile b/Jenkinsfile index 8ab088482491..8902151c603f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -96,13 +96,6 @@ pipeline { } } - // TODO: AMMO installation - move to requirements - stage('AMMO installation') { - steps { - sh 'pip install nvidia-ammo==0.7.2 --extra-index-url https://pypi.nvidia.com --no-cache-dir' - } - } - stage('PyTorch Lightning version') { steps { sh 'python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"' diff --git a/reinstall.sh b/reinstall.sh index d64b56103dd3..a5004590c7c1 100755 --- a/reinstall.sh +++ b/reinstall.sh @@ -34,7 +34,7 @@ else ${PIP} install build pytest-runner python -m build --no-isolation --wheel DIST_FILE=$(find ./dist -name "*.whl" | head -n 1) - ${PIP} install "${DIST_FILE}[all]" + ${PIP} install --extra-index-url https://pypi.nvidia.com "${DIST_FILE}[all]" fi echo 'All done!' 
diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 2484328293e1..888e66d194b3 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -12,6 +12,7 @@ markdown2 matplotlib>=3.3.2 megatron_core==0.5.0 nltk>=3.6.5 +nvidia-ammo==0.7.2 opencc<1.1.7 pangu rapidfuzz From 6ca03d4cd8560b20d0eb4fe1b8a3e9433a2665c9 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Thu, 22 Feb 2024 15:22:53 +0100 Subject: [PATCH 06/30] Bump AMMO version to latest Signed-off-by: Jan Lasek --- requirements/requirements_nlp.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 888e66d194b3..4bffca663d71 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -12,7 +12,7 @@ markdown2 matplotlib>=3.3.2 megatron_core==0.5.0 nltk>=3.6.5 -nvidia-ammo==0.7.2 +nvidia-ammo==0.7.3 opencc<1.1.7 pangu rapidfuzz From 5170db5c63c7902f9067114abe63cd63385f6370 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Thu, 22 Feb 2024 15:28:42 +0100 Subject: [PATCH 07/30] Guards workaround on spec definition Signed-off-by: Jan Lasek --- .../language_modeling/megatron/model_specs.py | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py index d7fb633b3eda..006f5d730045 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py @@ -12,19 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# TODO: This will be a part of MCore -from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear -from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules -from megatron.core.transformer.custom_layers.transformer_engine import TENorm -from megatron.core.transformer.dot_product_attention import DotProductAttention -from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.mlp import MLP, MLPSubmodules -from megatron.core.transformer.spec_utils import ModuleSpec -from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules +# TODO: This spec will be defined in MCore>=0.6.0 and is temporary +try: + + from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add + from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear + from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules + from megatron.core.transformer.custom_layers.transformer_engine import TENorm + from megatron.core.transformer.dot_product_attention import DotProductAttention + from megatron.core.transformer.enums import AttnMaskType + from megatron.core.transformer.mlp import MLP, MLPSubmodules + from megatron.core.transformer.spec_utils import ModuleSpec + from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + ModuleSpec = None def get_gpt_layer_ammo_spec() -> ModuleSpec: + assert HAVE_MEGATRON_CORE return ModuleSpec( module=TransformerLayer, submodules=TransformerLayerSubmodules( From 543dea1fa934fa6310c2fb20439efd9eef7e8b1f Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Thu, 29 Feb 2024 10:56:12 +0100 Subject: [PATCH 08/30] Save artifacts and tokenizer config at once Signed-off-by: Jan Lasek --- nemo/export/quantize/quantizer.py | 5 ++--- nemo/export/quantize/utils_wip.py | 20 ++++++++++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 37b635d30d2d..0e84d6bdad48 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -32,7 +32,7 @@ from nemo.utils import logging from nemo.utils.get_rank import is_global_rank_zero -from .utils_wip import copy_artifacts, temporary_directory # TODO: Find a good place for these utils +from .utils_wip import save_artifacts, temporary_directory # TODO: Find a good place for these utils QUANT_CFG_CHOICES = { "int8": atq.INT8_DEFAULT_CFG, @@ -161,6 +161,5 @@ def export(self, model, output_file: str, decoder_type: str, dtype: str, inferen if is_global_rank_zero(): logging.info(f"Exporting quantized weights, tokenizer config, and model artifacts to {output_file}...") with tarfile.open(output_file, "w:gz") as tar: - config = copy_artifacts(model, tmp_dir) - OmegaConf.save(config.tokenizer, os.path.join(tmp_dir, "tokenizer_config.yaml")) + save_artifacts(model, tmp_dir) tar.add(tmp_dir, arcname="./") diff --git a/nemo/export/quantize/utils_wip.py b/nemo/export/quantize/utils_wip.py index 0406353401ff..74ab5471de1e 100644 --- a/nemo/export/quantize/utils_wip.py +++ b/nemo/export/quantize/utils_wip.py @@ -47,11 +47,11 @@ def temporary_directory(): dist.barrier() -def copy_artifacts(model, output_dir: str): - """Copy all model artifacts to a given 
output directory and return modified config.""" +def save_artifacts(model, output_dir: str, use_abspath: bool = False) -> None: + """Save all model artifacts and tokenizer config to a given output directory.""" app_state = AppState() model_file = app_state.model_restore_path - model_config = copy.deepcopy(model.cfg) + model_cfg = copy.deepcopy(model.cfg) # Setup model file handling context: directory or tarball if os.path.isfile(model_file): @@ -66,11 +66,15 @@ def copy_artifacts(model, output_dir: str): # Copy or extract artifacts depending on the context with model_file_handler(**kwargs) as maybe_tar: for arti_name, arti_item in model.artifacts.items(): - _, arti_file = arti_item.path.split("nemo:") + arti_file = arti_item.path.removeprefix("nemo:") + arti_path = os.path.join(output_dir, arti_name) if maybe_tar is not None: maybe_tar.extract(f"./{arti_file}", path=output_dir) + os.rename(os.path.join(output_dir, arti_file), arti_path) else: - shutil.copy(os.path.join(model_file, arti_file), output_dir) - # Update artifact path to basename - OmegaConf.update(model_config, arti_name, os.path.basename(arti_file)) - return model_config + shutil.copy(os.path.join(model_file, arti_file), arti_path) + # Store artifact path as basename by default. Otherwise save absolute path but bear in mind + # that in this case output directory should be permanent for correct artifact recovery later + arti_path = os.path.abspath(arti_path) if use_abspath else os.path.basename(arti_path) + OmegaConf.update(model_cfg, arti_name, arti_path) + OmegaConf.save(model_cfg.tokenizer, os.path.join(output_dir, "tokenizer_config.yaml")) From 785167fe0d8d8b3f0097de048b9d6325060730f2 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Thu, 29 Feb 2024 11:27:31 +0100 Subject: [PATCH 09/30] Extend nemo.utils package with new tools Signed-off-by: Jan Lasek --- nemo/export/quantize/quantizer.py | 4 +- nemo/export/quantize/utils_wip.py | 80 ------------------------------- nemo/utils/distributed.py | 23 +++++++++ nemo/utils/model_utils.py | 36 ++++++++++++++ 4 files changed, 61 insertions(+), 82 deletions(-) delete mode 100644 nemo/export/quantize/utils_wip.py diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 0e84d6bdad48..4f3ea26513cf 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -30,9 +30,9 @@ from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.utils import logging +from nemo.utils.distributed import temporary_directory from nemo.utils.get_rank import is_global_rank_zero - -from .utils_wip import save_artifacts, temporary_directory # TODO: Find a good place for these utils +from nemo.utils.model_utils import save_artifacts QUANT_CFG_CHOICES = { "int8": atq.INT8_DEFAULT_CFG, diff --git a/nemo/export/quantize/utils_wip.py b/nemo/export/quantize/utils_wip.py deleted file mode 100644 index 74ab5471de1e..000000000000 --- a/nemo/export/quantize/utils_wip.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import contextlib -import copy -import os -import shutil -import tarfile -import tempfile - -import torch -import torch.distributed as dist -from omegaconf import OmegaConf - -from nemo.utils.app_state import AppState -from nemo.utils.get_rank import get_rank, is_global_rank_zero - - -@contextlib.contextmanager -def temporary_directory(): - """Create a shared temporary directory across ranks in distributed setup. - - This function assumes that the distributed setup has been already - correctly initialized. It is intended to be used only in single-node - setup so that all ranks can access the directory created.""" - - if is_global_rank_zero(): - tmp_dir = [tempfile.TemporaryDirectory()] - else: - tmp_dir = [None] - torch.distributed.broadcast_object_list(tmp_dir) - print(f"[{get_rank()}] tmp_dir={tmp_dir}") # TODO: remove debug print - yield tmp_dir[0].name - # We use barrier below to make sure that rank zero won't exit - # and delete tmp_dir while other ranks may still use it - dist.barrier() - - -def save_artifacts(model, output_dir: str, use_abspath: bool = False) -> None: - """Save all model artifacts and tokenizer config to a given output directory.""" - app_state = AppState() - model_file = app_state.model_restore_path - model_cfg = copy.deepcopy(model.cfg) - - # Setup model file handling context: directory or tarball - if os.path.isfile(model_file): - model_file_handler = tarfile.open - kwargs = {"name": model_file, "mode": "r:"} - elif os.path.isdir(model_file): - model_file_handler = contextlib.nullcontext - kwargs = {} - else: - raise FileNotFoundError(model_file) - - # Copy or extract artifacts depending on the context - with model_file_handler(**kwargs) as maybe_tar: - for arti_name, arti_item in model.artifacts.items(): - arti_file = arti_item.path.removeprefix("nemo:") - arti_path = os.path.join(output_dir, arti_name) - if maybe_tar is not None: - maybe_tar.extract(f"./{arti_file}", path=output_dir) - os.rename(os.path.join(output_dir, arti_file), arti_path) - else: - shutil.copy(os.path.join(model_file, arti_file), arti_path) - # Store artifact path as basename by default. Otherwise save absolute path but bear in mind - # that in this case output directory should be permanent for correct artifact recovery later - arti_path = os.path.abspath(arti_path) if use_abspath else os.path.basename(arti_path) - OmegaConf.update(model_cfg, arti_name, arti_path) - OmegaConf.save(model_cfg.tokenizer, os.path.join(output_dir, "tokenizer_config.yaml")) diff --git a/nemo/utils/distributed.py b/nemo/utils/distributed.py index b0d24de3e5b4..ee6c107b1d85 100644 --- a/nemo/utils/distributed.py +++ b/nemo/utils/distributed.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import contextlib import os +import tempfile import torch +import torch.distributed as dist from nemo.utils import logging +from nemo.utils.get_rank import is_global_rank_zero try: from megatron.core import parallel_state @@ -100,3 +104,22 @@ def gather_objects(partial_results_list, main_rank=None): results_list.extend(r) return results_list + + +@contextlib.contextmanager +def temporary_directory(): + """Create a shared temporary directory across ranks in distributed setup. + + This function assumes that the distributed setup has been already + correctly initialized. It is intended to be used only in single-node + setup so that all ranks can access the directory created.""" + + if is_global_rank_zero(): + tmp_dir = [tempfile.TemporaryDirectory()] + else: + tmp_dir = [None] + dist.broadcast_object_list(tmp_dir) + yield tmp_dir[0].name + # We use barrier below to make sure that rank zero won't exit + # and delete tmp_dir while other ranks may still use it + dist.barrier() diff --git a/nemo/utils/model_utils.py b/nemo/utils/model_utils.py index b2a6abbf54aa..c7497511572a 100644 --- a/nemo/utils/model_utils.py +++ b/nemo/utils/model_utils.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import contextlib import copy import importlib import os +import shutil +import tarfile from dataclasses import dataclass, is_dataclass from enum import Enum from functools import lru_cache @@ -636,3 +639,36 @@ def ckpt_to_dir(filepath: Union[str, Path]) -> Path: checkpoint_dir = filepath.with_name(filepath.stem) return checkpoint_dir + + +def save_artifacts(model, output_dir: str, use_abspath: bool = False) -> None: + """Save all model artifacts and tokenizer config to a given output directory.""" + app_state = AppState() + model_file = app_state.model_restore_path + model_cfg = copy.deepcopy(model.cfg) + + # Setup model file handling context: directory or tarball + if os.path.isfile(model_file): + model_file_handler = tarfile.open + kwargs = {"name": model_file, "mode": "r:"} + elif os.path.isdir(model_file): + model_file_handler = contextlib.nullcontext + kwargs = {} + else: + raise FileNotFoundError(model_file) + + # Copy or extract artifacts depending on the context + with model_file_handler(**kwargs) as maybe_tar: + for arti_name, arti_item in model.artifacts.items(): + _, arti_file = arti_item.path.split("nemo:") + arti_path = os.path.join(output_dir, arti_name) + if maybe_tar is not None: + maybe_tar.extract(f"./{arti_file}", path=output_dir) + os.rename(os.path.join(output_dir, arti_file), arti_path) + else: + shutil.copy(os.path.join(model_file, arti_file), arti_path) + # Store artifact path as basename by default. 
Otherwise save absolute path but bear in mind + # that in this case output directory should be permanent for correct artifact recovery later + arti_path = os.path.abspath(arti_path) if use_abspath else os.path.basename(arti_path) + OmegaConf.update(model_cfg, arti_name, arti_path) + OmegaConf.save(model_cfg.tokenizer, os.path.join(output_dir, "tokenizer_config.yaml")) From 0f29ac476e30628607a90be1f88d0e7df8e22c3d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 Feb 2024 12:00:17 +0000 Subject: [PATCH 10/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index d5024495889e..3e6ba169ff3c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -43,7 +43,6 @@ from nemo.collections.nlp.models.language_modeling.megatron.gpt_full_te_layer_autocast_spec import ( get_gpt_full_te_layer_autocast_spec, ) -from nemo.collections.nlp.models.language_modeling.megatron.model_specs import get_gpt_layer_ammo_spec from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel from nemo.collections.nlp.models.language_modeling.megatron.model_specs import get_gpt_layer_ammo_spec from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel From acfb441375d7d2f5b390ab4fc3d74396a36d2c7b Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 4 Mar 2024 09:54:37 +0100 Subject: [PATCH 11/30] Reorganize & reformat Signed-off-by: Jan Lasek --- .../conf/megatron_llama_quantization.yaml | 14 ++-- .../megatron_llama_quantization.py | 15 ++-- nemo/export/quantize/quantizer.py | 79 +++++++++++++------ tests/setup/__main__.py | 3 +- tests/setup/models/create_hf_model.py | 16 +++- 5 files changed, 88 insertions(+), 39 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml index 5603aa9c92ba..b374daa3a638 100644 --- a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml +++ b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml @@ -24,12 +24,14 @@ quantization: quantize_bmm1: false algorithm: fp8 # int8_sq, fp8, int8, int4_awq calib_dataset: cnn_dailymail # pileval, wikitext, cnn_dailymail - num_calib_size: 128 # number of samples used for calibration + num_calib_size: 512 # number of samples used for calibration +export: + decoder_type: llama # gptnext, gpt2, llama + inference_tensor_parallel: 1 # Default using 1 TP for inference + dtype: 16 # Default precision data type + +model_file: llama2-7b-fp16.nemo # Nemo file path +model_save: llama2-7b-fp8.qnemo # Path where the quantized model will be saved tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 -decoder_type: llama # gptnext, llama -model_file: llama2-7b-fp16.nemo # nemo file path -model_save_path: llama2-7b-fp16.qnemo # Path where the quantized model will be saved -inference_tensor_parallel: 1 # Default using 1 TP for inference -dtype: 16 # Default precision data type diff --git a/examples/nlp/language_modeling/megatron_llama_quantization.py 
b/examples/nlp/language_modeling/megatron_llama_quantization.py index 5565900f901b..ebc9897d1fb7 100644 --- a/examples/nlp/language_modeling/megatron_llama_quantization.py +++ b/examples/nlp/language_modeling/megatron_llama_quantization.py @@ -32,9 +32,10 @@ ``` python examples/nlp/language_modeling/megatron_llama_quantization.py \ model_file=llama2-7b-fp16.nemo \ - decoder_type=llama \ - quantization.algorithm=int8_sq \ - model_save_path=llama2-7b-fp16.qnemo + model_save=llama2-7b-fp8.qnemo \ + quantization.algorithm=fp8 \ + export.decoder_type=llama \ + export.inference_tensor_parallel=1 ``` """ @@ -66,7 +67,7 @@ def main(cfg) -> None: if not torch.cuda.is_available(): raise EnvironmentError("GPU is required for the inference.") - quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.trainer) + quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.export, cfg.trainer) dataloader = get_calib_dataloader( cfg.quantization.calib_dataset, @@ -76,9 +77,11 @@ def main(cfg) -> None: ) dataloader = [data for data in dataloader] - model = quantizer.quantize(cfg.model_file, dataloader, cfg.tensor_model_parallel_size) + model = quantizer.quantize( + cfg.model_file, dataloader, cfg.tensor_model_parallel_size, cfg.pipeline_model_parallel_size + ) - quantizer.export(model, cfg.model_save_path, cfg.decoder_type, cfg.dtype, cfg.inference_tensor_parallel) + quantizer.export(model, cfg.model_save) if __name__ == '__main__': diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 4f3ea26513cf..f17978c3e87e 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -15,12 +15,11 @@ import copy import os import tarfile -from typing import Optional +from typing import List, Optional import ammo.torch.quantization as atq import torch.distributed as dist from ammo.torch.export import export_model_config -from ammo.torch.utils import print_rank_0 from megatron.core import parallel_state from omegaconf import OmegaConf from omegaconf.omegaconf import DictConfig, open_dict @@ -42,6 +41,8 @@ "w4a8_awq": atq.W4A8_AWQ_BETA_CFG, } +SUPPORTED_DTYPE = [16, "16", "bf16"] # Default precision for non-quantized layers + class Quantizer: @@ -64,16 +65,24 @@ class Quantizer: model families is experimental and might not be fully supported. Available quantization methods are listed in QUANT_CFG_CHOICES dictionary on top of this file. - Please consult AMMO docummentation for details. You can also ispect different choices in + Please consult AMMO documentation for details. You can also inspect different choices in examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml for quantization algorithms and calibration data as well as recommended settings. 
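A rough end-to-end usage sketch mirroring how the example script wires the class up in this patch; the config path and the inline calibration text are placeholders, not recommendations:

    from omegaconf import OmegaConf
    from nemo.export.quantize import Quantizer

    cfg = OmegaConf.load("examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml")
    quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.export, cfg.trainer)

    # Calibration data is a list of batches, each batch a list of raw text strings.
    dataloader = [["Some calibration text."] * cfg.inference.batch_size]

    model = quantizer.quantize(
        cfg.model_file, dataloader, cfg.tensor_model_parallel_size, cfg.pipeline_model_parallel_size
    )
    quantizer.export(model, cfg.model_save)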
""" - def __init__(self, quantization_config: DictConfig, inference_config: DictConfig, trainer_config: DictConfig): + def __init__( + self, + quantization_config: DictConfig, + inference_config: DictConfig, + export_config: DictConfig, + trainer_config: DictConfig, + ): + assert export_config.dtype in SUPPORTED_DTYPE + assert quantization_config.algorithm in QUANT_CFG_CHOICES self.quantization_config = quantization_config self.inference_config = inference_config + self.export_config = export_config self.trainer_config = trainer_config - assert self.quantization_config.algorithm in QUANT_CFG_CHOICES atq_config = QUANT_CFG_CHOICES[quantization_config.algorithm] if quantization_config.algorithm != "fp8": # disable quantization for the last output layer @@ -81,17 +90,27 @@ def __init__(self, quantization_config: DictConfig, inference_config: DictConfig atq_config["quant_cfg"]["*.output_layer.*"] = {"enable": False} self.atq_config = atq_config - def _load_model(self, model_file, tensor_model_parallel_size: Optional[int] = None): + def _load_model( + self, + model_file: str, + tensor_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_size: Optional[int] = None, + ): trainer = Trainer(strategy=NLPDDPStrategy(), **self.trainer_config) connector = NLPSaveRestoreConnector() if os.path.isdir(model_file): connector.model_extracted_dir = model_file - model_cfg = self._restore_and_modify_config(model_file, trainer, connector, tensor_model_parallel_size) + model_cfg = self._restore_and_modify_config( + model_file, trainer, connector, tensor_model_parallel_size, pipeline_model_parallel_size + ) model = MegatronGPTModel.restore_from( - restore_path=model_file, trainer=trainer, override_config_path=model_cfg, save_restore_connector=connector, + restore_path=model_file, + trainer=trainer, + override_config_path=model_cfg, + save_restore_connector=connector, ) model.freeze() @@ -99,7 +118,10 @@ def _load_model(self, model_file, tensor_model_parallel_size: Optional[int] = No model.model.module.language_model.encoder.activations_checkpoint_method = None except AttributeError: pass - print_rank_0(model) + + if is_global_rank_zero(): + print(model) + self._check_ddp_initialized(model) return model @@ -119,47 +141,60 @@ def _restore_and_modify_config( trainer: Trainer, connector: NLPSaveRestoreConnector, tensor_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_size: Optional[int] = None, ): model_cfg = MegatronGPTModel.restore_from( - restore_path=model_file, trainer=trainer, save_restore_connector=connector, return_config=True, + restore_path=model_file, + trainer=trainer, + save_restore_connector=connector, + return_config=True, ) with open_dict(model_cfg): model_cfg.activations_checkpoint_method = None model_cfg.activations_checkpoint_granularity = None if tensor_model_parallel_size is not None: model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.name = "ammo" # Model needs to be loaded in "ammo" layer spec + if pipeline_model_parallel_size is not None: + model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size + # Only custom AMMO spec is supported for PTQ: this custom spec is largely based on local Megatron-LM + # layer definitions to avoid Transformer Engine implementations that are currently not supported. 
+ model_cfg.name = "ammo" return model_cfg - def quantize(self, model_file: str, dataloader, tensor_model_parallel_size: Optional[int] = None): - model = self._load_model(model_file, tensor_model_parallel_size) + def quantize( + self, + model_file: str, + dataloader: List[List[str]], + tensor_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_size: Optional[int] = None, + ): + model = self._load_model(model_file, tensor_model_parallel_size, pipeline_model_parallel_size) model.set_inference_config(OmegaConf.to_container(self.inference_config)) def forward_loop(): for i, batch in enumerate(dataloader): - print_rank_0(f"Calibrating batch {i}") + if is_global_rank_zero(): + print(f"Calibrating batch {i}") model.predict_step(batch, i) atq.quantize(model, self.atq_config, forward_loop) return model - def export(self, model, output_file: str, decoder_type: str, dtype: str, inference_tensor_parallel: int): - supported_dtype = [16, "16", "bf16"] # FIXME: Move that to top - assert dtype in supported_dtype, f"{dtype} not supported. Supported dtypes are {supported_dtype}" - torch_dtype = torch_dtype_from_precision(dtype) + def export(self, model, model_save: str): + torch_dtype = torch_dtype_from_precision(self.export_config.dtype) with temporary_directory() as tmp_dir: export_model_config( model=model, - decoder_type=decoder_type, + decoder_type=self.export_config.decoder_type, dtype=torch_dtype, export_dir=tmp_dir, - inference_tensor_parallel=inference_tensor_parallel, + inference_tensor_parallel=self.export_config.inference_tensor_parallel, ) dist.barrier() # Wait until all ranks complete export_model_config step if is_global_rank_zero(): - logging.info(f"Exporting quantized weights, tokenizer config, and model artifacts to {output_file}...") - with tarfile.open(output_file, "w:gz") as tar: + logging.info(f"Exporting quantized weights, model artifacts, and tokenizer config to {model_save}...") + with tarfile.open(model_save, "w:gz") as tar: save_artifacts(model, tmp_dir) tar.add(tmp_dir, arcname="./") diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py index 51cdab795a99..c9f33cf73996 100644 --- a/tests/setup/__main__.py +++ b/tests/setup/__main__.py @@ -30,7 +30,8 @@ os.makedirs(args.save_dir, exist_ok=True) create_sample_jsonl( - output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), overwrite=args.overwrite, + output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), + overwrite=args.overwrite, ) create_hf_model( diff --git a/tests/setup/models/create_hf_model.py b/tests/setup/models/create_hf_model.py index 9f57d5996dfc..fd6c1bdb0277 100644 --- a/tests/setup/models/create_hf_model.py +++ b/tests/setup/models/create_hf_model.py @@ -79,16 +79,24 @@ def create_hf_model( if __name__ == "__main__": parser = argparse.ArgumentParser("Create a HuggingFace model (random initialization) for testing purposes.") parser.add_argument( - "--model_name_or_path", required=True, help="Model name or local path with model config and tokenizer", + "--model_name_or_path", + required=True, + help="Model name or local path with model config and tokenizer", ) parser.add_argument( - "--output_dir", required=True, help="Output directory", + "--output_dir", + required=True, + help="Output directory", ) parser.add_argument( - "--config_updates", type=json.loads, help="Parameter updates in JSON format to overwrite for model config", + "--config_updates", + type=json.loads, + help="Parameter updates in JSON format to overwrite for model config", ) 
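For illustration, the create_hf_model test helper touched in this hunk can also be driven directly from Python; the model name, output directory and layer sizes below are placeholders taken from the CI setup, not requirements:

    from tests.setup.models.create_hf_model import create_hf_model

    # Produce a small, randomly initialized Llama-style HF checkpoint for tests.
    create_hf_model(
        model_name_or_path="meta-llama/Llama-2-7b-hf",
        output_dir="/tmp/tiny_llama2_hf",
        config_updates={"hidden_size": 128, "num_attention_heads": 4,
                        "num_hidden_layers": 2, "num_key_value_heads": 4},
    )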
parser.add_argument( - "--overwrite", action="store_true", help="Overwrite file if it exists", + "--overwrite", + action="store_true", + help="Overwrite file if it exists", ) args = parser.parse_args() create_hf_model(args.model_name_or_path, args.output_dir, args.config_updates) From e03cb87088c60bbb05cc7e53598a2fd748a9249c Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 4 Mar 2024 10:50:25 +0100 Subject: [PATCH 12/30] Tests for FP8 and INT4 AWQ Signed-off-by: Jan Lasek --- Jenkinsfile | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8902151c603f..5af6fc147dee 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -453,15 +453,39 @@ pipeline { } failFast true parallel { - stage('Llama') { + stage('Llama2 - INT8 SQ') { steps { - sh 'CUDA_VISIBLE_DEVICES=0 python examples/nlp/language_modeling/megatron_llama_quantization.py \ + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ quantization.algorithm=int8_sq \ quantization.num_calib_size=8 \ inference.batch_size=2 \ - model_save_path=/home/TestData/nlp/megatron_llama/ci.qnemo' + model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' + } + } + stage('Llama2 - FP8') { + steps { + sh 'mpirun -n 2 --allow-run-as-root python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=fp8 \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' + } + } + stage('Llama2 - INT4 AWQ') { + steps { + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=int4_awq \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' } } From 4b49ec6b2bd73fe7daba3d8bccf08023b0c1a536 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Mar 2024 10:07:27 +0000 Subject: [PATCH 13/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/export/quantize/quantizer.py | 10 ++-------- tests/setup/__main__.py | 3 +-- tests/setup/models/create_hf_model.py | 16 ++++------------ 3 files changed, 7 insertions(+), 22 deletions(-) diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index f17978c3e87e..54f0657a44b7 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -107,10 +107,7 @@ def _load_model( ) model = MegatronGPTModel.restore_from( - restore_path=model_file, - trainer=trainer, - override_config_path=model_cfg, - save_restore_connector=connector, + restore_path=model_file, trainer=trainer, override_config_path=model_cfg, save_restore_connector=connector, ) model.freeze() @@ -144,10 +141,7 @@ def _restore_and_modify_config( pipeline_model_parallel_size: Optional[int] = None, ): model_cfg = MegatronGPTModel.restore_from( - 
restore_path=model_file, - trainer=trainer, - save_restore_connector=connector, - return_config=True, + restore_path=model_file, trainer=trainer, save_restore_connector=connector, return_config=True, ) with open_dict(model_cfg): model_cfg.activations_checkpoint_method = None diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py index c9f33cf73996..51cdab795a99 100644 --- a/tests/setup/__main__.py +++ b/tests/setup/__main__.py @@ -30,8 +30,7 @@ os.makedirs(args.save_dir, exist_ok=True) create_sample_jsonl( - output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), - overwrite=args.overwrite, + output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), overwrite=args.overwrite, ) create_hf_model( diff --git a/tests/setup/models/create_hf_model.py b/tests/setup/models/create_hf_model.py index fd6c1bdb0277..9f57d5996dfc 100644 --- a/tests/setup/models/create_hf_model.py +++ b/tests/setup/models/create_hf_model.py @@ -79,24 +79,16 @@ def create_hf_model( if __name__ == "__main__": parser = argparse.ArgumentParser("Create a HuggingFace model (random initialization) for testing purposes.") parser.add_argument( - "--model_name_or_path", - required=True, - help="Model name or local path with model config and tokenizer", + "--model_name_or_path", required=True, help="Model name or local path with model config and tokenizer", ) parser.add_argument( - "--output_dir", - required=True, - help="Output directory", + "--output_dir", required=True, help="Output directory", ) parser.add_argument( - "--config_updates", - type=json.loads, - help="Parameter updates in JSON format to overwrite for model config", + "--config_updates", type=json.loads, help="Parameter updates in JSON format to overwrite for model config", ) parser.add_argument( - "--overwrite", - action="store_true", - help="Overwrite file if it exists", + "--overwrite", action="store_true", help="Overwrite file if it exists", ) args = parser.parse_args() create_hf_model(args.model_name_or_path, args.output_dir, args.config_updates) From 90988b141a75b06ddb25aeca5bfaa3563d25ee1d Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 4 Mar 2024 11:19:04 +0100 Subject: [PATCH 14/30] Add load_config helper function Signed-off-by: Jan Lasek --- nemo/export/quantize/quantizer.py | 25 +++++++++++-------------- nemo/utils/model_utils.py | 13 +++++++++++++ 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 54f0657a44b7..571d398850bc 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -31,7 +31,7 @@ from nemo.utils import logging from nemo.utils.distributed import temporary_directory from nemo.utils.get_rank import is_global_rank_zero -from nemo.utils.model_utils import save_artifacts +from nemo.utils.model_utils import load_config, save_artifacts QUANT_CFG_CHOICES = { "int8": atq.INT8_DEFAULT_CFG, @@ -96,16 +96,14 @@ def _load_model( tensor_model_parallel_size: Optional[int] = None, pipeline_model_parallel_size: Optional[int] = None, ): + """Load model using AMMO layer spec for quantization.""" + model_cfg = self._load_and_modify_config( + model_file, tensor_model_parallel_size, pipeline_model_parallel_size + ) + trainer = Trainer(strategy=NLPDDPStrategy(), **self.trainer_config) connector = NLPSaveRestoreConnector() - if os.path.isdir(model_file): - connector.model_extracted_dir = model_file - - model_cfg = self._restore_and_modify_config( - model_file, trainer, connector, 
tensor_model_parallel_size, pipeline_model_parallel_size - ) - model = MegatronGPTModel.restore_from( restore_path=model_file, trainer=trainer, override_config_path=model_cfg, save_restore_connector=connector, ) @@ -132,17 +130,14 @@ def dummy(): model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) model.trainer.strategy.setup_environment() - def _restore_and_modify_config( + def _load_and_modify_config( self, model_file: str, - trainer: Trainer, - connector: NLPSaveRestoreConnector, tensor_model_parallel_size: Optional[int] = None, pipeline_model_parallel_size: Optional[int] = None, ): - model_cfg = MegatronGPTModel.restore_from( - restore_path=model_file, trainer=trainer, save_restore_connector=connector, return_config=True, - ) + model_cfg = load_config(model_file) + with open_dict(model_cfg): model_cfg.activations_checkpoint_method = None model_cfg.activations_checkpoint_granularity = None @@ -163,6 +158,7 @@ def quantize( tensor_model_parallel_size: Optional[int] = None, pipeline_model_parallel_size: Optional[int] = None, ): + """Quantize model checkpoint using given dataloader and optional custom parallelism settings.""" model = self._load_model(model_file, tensor_model_parallel_size, pipeline_model_parallel_size) model.set_inference_config(OmegaConf.to_container(self.inference_config)) @@ -176,6 +172,7 @@ def forward_loop(): return model def export(self, model, model_save: str): + """Export model to '.qnemo' format for TensorRT-LLM engine build.""" torch_dtype = torch_dtype_from_precision(self.export_config.dtype) with temporary_directory() as tmp_dir: diff --git a/nemo/utils/model_utils.py b/nemo/utils/model_utils.py index c7497511572a..8889f13d5b98 100644 --- a/nemo/utils/model_utils.py +++ b/nemo/utils/model_utils.py @@ -18,6 +18,7 @@ import os import shutil import tarfile +import tempfile from dataclasses import dataclass, is_dataclass from enum import Enum from functools import lru_cache @@ -64,6 +65,18 @@ class ArtifactItem: hashed_path: Optional[str] = None +def load_config(model_file: str) -> DictConfig: + """Load model config from extracted directory or '.nemo' tarball.""" + if os.path.isfile(model_file): + with tempfile.TemporaryDirectory() as tmp, tarfile.open(model_file, "r:") as tar: + tar.extract("./model_config.yaml", path=tmp) + model_config = OmegaConf.load(os.path.join(tmp, "model_config.yaml")) + else: + model_config = OmegaConf.load(os.path.join(model_file, "model_config.yaml")) + + return model_config + + def resolve_dataset_name_from_cfg(cfg: 'DictConfig') -> Optional[str]: """ Parses items of the provided sub-config to find the first potential key that From d115befde42778972612facd83ff43cddf410340 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Mar 2024 10:20:30 +0000 Subject: [PATCH 15/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/export/quantize/quantizer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 571d398850bc..998935cac214 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -97,9 +97,7 @@ def _load_model( pipeline_model_parallel_size: Optional[int] = None, ): """Load model using AMMO layer spec for quantization.""" - model_cfg = self._load_and_modify_config( - model_file, tensor_model_parallel_size, pipeline_model_parallel_size - ) + model_cfg = 
self._load_and_modify_config(model_file, tensor_model_parallel_size, pipeline_model_parallel_size) trainer = Trainer(strategy=NLPDDPStrategy(), **self.trainer_config) connector = NLPSaveRestoreConnector() From 6fcbcd06f48441d1a7fc7f2a05ec80dea729171c Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 4 Mar 2024 11:23:31 +0100 Subject: [PATCH 16/30] Unused import removal Signed-off-by: Jan Lasek --- nemo/export/quantize/quantizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 998935cac214..dc044e52870a 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -13,7 +13,6 @@ # limitations under the License. import copy -import os import tarfile from typing import List, Optional From a5e818f08e4395e6df23795651a211efefdc6331 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 4 Mar 2024 12:39:13 +0100 Subject: [PATCH 17/30] Fix FP8 Jenkins test Signed-off-by: Jan Lasek --- Jenkinsfile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 5af6fc147dee..a560f284d8a8 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -462,19 +462,21 @@ pipeline { quantization.num_calib_size=8 \ inference.batch_size=2 \ model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci.qnemo' } } stage('Llama2 - FP8') { steps { sh 'mpirun -n 2 --allow-run-as-root python examples/nlp/language_modeling/megatron_llama_quantization.py \ model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ + tensor_model_parallel_size=2 \ + trainer.devices=2 \ quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ quantization.algorithm=fp8 \ quantization.num_calib_size=8 \ inference.batch_size=2 \ model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci.qnemo' } } stage('Llama2 - INT4 AWQ') { @@ -486,11 +488,12 @@ pipeline { quantization.num_calib_size=8 \ inference.batch_size=2 \ model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci.qnemo' } } } } + stage('L2: ASR dev run') { when { anyOf { From 12f3717861f64791e2a924e9af9e57108ac1d764 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 5 Mar 2024 12:53:13 +0100 Subject: [PATCH 18/30] Fix TP=2 test cont'd: no need to use mpirun Signed-off-by: Jan Lasek --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index a560f284d8a8..ab4105fa219f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -467,7 +467,7 @@ pipeline { } stage('Llama2 - FP8') { steps { - sh 'mpirun -n 2 --allow-run-as-root python examples/nlp/language_modeling/megatron_llama_quantization.py \ + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ tensor_model_parallel_size=2 \ trainer.devices=2 \ From a96be0f4d2347850dc222cbe0c8312d805feb003 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 5 Mar 2024 14:40:12 +0100 Subject: [PATCH 19/30] Allow for patches in AMMO versioning Signed-off-by: Jan Lasek --- requirements/requirements_nlp.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 4bffca663d71..e613bf649692 100644 --- 
a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -12,7 +12,7 @@ markdown2 matplotlib>=3.3.2 megatron_core==0.5.0 nltk>=3.6.5 -nvidia-ammo==0.7.3 +nvidia-ammo~=0.7.3 opencc<1.1.7 pangu rapidfuzz From c99b99231e92539eec165596d2c22d1ebb2e7de7 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 5 Mar 2024 14:41:48 +0100 Subject: [PATCH 20/30] Drop AWQ test for now (need to debug) Signed-off-by: Jan Lasek --- Jenkinsfile | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index ab4105fa219f..6e550ef68673 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -479,18 +479,6 @@ pipeline { sh 'rm -f /home/TestData/nlp/megatron_llama/ci.qnemo' } } - stage('Llama2 - INT4 AWQ') { - steps { - sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ - model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ - quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - quantization.algorithm=int4_awq \ - quantization.num_calib_size=8 \ - inference.batch_size=2 \ - model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.qnemo' - } - } } } From 2eff82fe66b545c37f69bf9ab00fff1c63366c35 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 5 Mar 2024 16:35:19 +0100 Subject: [PATCH 21/30] Allow for patches in AMMO versioning cont'd Signed-off-by: Jan Lasek --- requirements/requirements_nlp.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index e613bf649692..984a3aa45478 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -12,7 +12,7 @@ markdown2 matplotlib>=3.3.2 megatron_core==0.5.0 nltk>=3.6.5 -nvidia-ammo~=0.7.3 +nvidia-ammo~=0.7.0 opencc<1.1.7 pangu rapidfuzz From 739fe3032767128c85ba7d38c6a4ce7ce5d2925b Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Wed, 6 Mar 2024 10:20:11 +0100 Subject: [PATCH 22/30] Use AMMO spec from MCore as it has been published Signed-off-by: Jan Lasek --- Dockerfile | 2 +- Jenkinsfile | 2 +- .../conf/megatron_llama_quantization.yaml | 2 +- .../megatron_llama_quantization.py | 2 +- .../language_modeling/megatron/model_specs.py | 60 ------------------- .../language_modeling/megatron_gpt_model.py | 2 +- nemo/export/quantize/quantizer.py | 3 +- 7 files changed, 7 insertions(+), 66 deletions(-) delete mode 100644 nemo/collections/nlp/models/language_modeling/megatron/model_specs.py diff --git a/Dockerfile b/Dockerfile index de85b35bf253..81e8f7c73b7b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -66,7 +66,7 @@ WORKDIR /workspace/ # We leave it here in case we need to work off of a specific commit in main RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout ad53b1e38689a0ceed75ade7821f4e6c7554abb4 && \ + git checkout 36e9b6bf3d8034b10c9bbd9fc357c2df2bd1515c && \ pip install . # Performance optimizations for distributed optimizer: https://github.com/NVIDIA/apex/pull/1771 diff --git a/Jenkinsfile b/Jenkinsfile index 5a1aabf5559c..5a60530aa47e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -91,7 +91,7 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout 5f9c870f9f24b482509699d206a9dbb00958f6fc && \ + git checkout 36e9b6bf3d8034b10c9bbd9fc357c2df2bd1515c && \ pip install .' 
} } diff --git a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml index b374daa3a638..5a5e87eba7c4 100644 --- a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml +++ b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml @@ -9,7 +9,7 @@ inference: repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. compute_logprob: false # a flag used to compute logprob of all the input text, a very special case of running inference, default False - batch_size: 4 # batch size for inference + batch_size: 64 # batch size for inference max_context_length: 512 # max length of the context, input sequence will be truncated if it is longer than this trainer: diff --git a/examples/nlp/language_modeling/megatron_llama_quantization.py b/examples/nlp/language_modeling/megatron_llama_quantization.py index ebc9897d1fb7..7b6fe88ba4f6 100644 --- a/examples/nlp/language_modeling/megatron_llama_quantization.py +++ b/examples/nlp/language_modeling/megatron_llama_quantization.py @@ -40,7 +40,7 @@ """ -def get_calib_dataloader(data="cnn_dailymail", batch_size=4, calib_size=512, max_sequence_length=512): +def get_calib_dataloader(data="cnn_dailymail", batch_size=64, calib_size=512, max_sequence_length=512): if data == "pileval": dataset = load_dataset("json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train") text_column = "text" diff --git a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py deleted file mode 100644 index 006f5d730045..000000000000 --- a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# TODO: This spec will be defined in MCore>=0.6.0 and is temporary -try: - - from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add - from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear - from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules - from megatron.core.transformer.custom_layers.transformer_engine import TENorm - from megatron.core.transformer.dot_product_attention import DotProductAttention - from megatron.core.transformer.enums import AttnMaskType - from megatron.core.transformer.mlp import MLP, MLPSubmodules - from megatron.core.transformer.spec_utils import ModuleSpec - from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - ModuleSpec = None - - -def get_gpt_layer_ammo_spec() -> ModuleSpec: - assert HAVE_MEGATRON_CORE - return ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - input_layernorm=TENorm, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=ColumnParallelLinear, core_attention=DotProductAttention, linear_proj=RowParallelLinear, - ), - ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=TENorm, - mlp=ModuleSpec( - module=MLP, submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,), - ), - mlp_bda=get_bias_dropout_add, - sharded_state_dict_keys_map={ - 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', - 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', - }, - ), - ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index ae3def28df3d..2afbed322abc 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -44,7 +44,6 @@ get_gpt_full_te_layer_autocast_spec, ) from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel -from nemo.collections.nlp.models.language_modeling.megatron.model_specs import get_gpt_layer_ammo_spec from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel from nemo.collections.nlp.modules.common.megatron.build_model import build_model from nemo.collections.nlp.modules.common.megatron.module import Float16Module @@ -91,6 +90,7 @@ from megatron.core import InferenceParams, parallel_state, tensor_parallel from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset + from megatron.core.deploy.gpt.model_specs import get_gpt_layer_ammo_spec from megatron.core.models.gpt import GPTModel as MCoreGPTModel from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.pipeline_parallel.schedules import get_forward_backward_func diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index dc044e52870a..8e4c2c4a1386 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -111,10 +111,11 @@ def _load_model( except AttributeError: pass + self._check_ddp_initialized(model) + if is_global_rank_zero(): print(model) - 
self._check_ddp_initialized(model) return model def _check_ddp_initialized(self, model): From ae0498dbccf89db58e66a20aee71dcb0e0d6abd4 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Fri, 8 Mar 2024 14:48:50 +0100 Subject: [PATCH 23/30] Make AMMO optional dependency and properly import guard it Signed-off-by: Jan Lasek --- Dockerfile | 4 +++- Jenkinsfile | 6 ++++++ nemo/export/quantize/quantizer.py | 29 ++++++++++++++++++----------- reinstall.sh | 2 +- requirements/requirements_nlp.txt | 1 - 5 files changed, 28 insertions(+), 14 deletions(-) diff --git a/Dockerfile b/Dockerfile index 81e8f7c73b7b..970c34a690d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -126,12 +126,14 @@ RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_k2.sh); INSTALL WORKDIR /tmp/nemo ENV LHOTSE_REQUIRE_TORCHAUDIO=0 COPY requirements . -RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --extra-index-url https://pypi.nvidia.com --no-cache-dir -r $f; done +RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done # install flash attention RUN pip install flash-attn # install numba for latest containers RUN pip install numba>=0.57.1 +# install ammo +RUN pip install nvidia-ammo~=0.7.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir # copy nemo source into a scratch image FROM scratch as nemo-src diff --git a/Jenkinsfile b/Jenkinsfile index 5a60530aa47e..b67fb4ac6f74 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -96,6 +96,12 @@ pipeline { } } + stage('AMMO installation') { + steps { + sh 'pip install nvidia-ammo~=0.7.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir' + } + } + stage('PyTorch Lightning version') { steps { sh 'python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"' diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 8e4c2c4a1386..962529661d4e 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -16,9 +16,7 @@ import tarfile from typing import List, Optional -import ammo.torch.quantization as atq import torch.distributed as dist -from ammo.torch.export import export_model_config from megatron.core import parallel_state from omegaconf import OmegaConf from omegaconf.omegaconf import DictConfig, open_dict @@ -32,15 +30,14 @@ from nemo.utils.get_rank import is_global_rank_zero from nemo.utils.model_utils import load_config, save_artifacts -QUANT_CFG_CHOICES = { - "int8": atq.INT8_DEFAULT_CFG, - "int8_sq": atq.INT8_SMOOTHQUANT_CFG, - "fp8": atq.FP8_DEFAULT_CFG, - "int4_awq": atq.INT4_AWQ_CFG, - "w4a8_awq": atq.W4A8_AWQ_BETA_CFG, -} +try: + import ammo.torch.quantization as atq + from ammo.torch.export import export_model_config + HAVE_AMMO = True -SUPPORTED_DTYPE = [16, "16", "bf16"] # Default precision for non-quantized layers +except (ImportError, ModuleNotFoundError) as e: + HAVE_AMMO = False + HAVE_AMMO_ERROR = e class Quantizer: @@ -63,7 +60,7 @@ class Quantizer: the quantization command with decoder_type parameter on exporting (see below). Quantizing other model families is experimental and might not be fully supported. - Available quantization methods are listed in QUANT_CFG_CHOICES dictionary on top of this file. + Available quantization methods are listed in QUANT_CFG_CHOICES dictionary below. Please consult AMMO documentation for details. 
You can also inspect different choices in examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml for quantization algorithms and calibration data as well as recommended settings. @@ -76,6 +73,16 @@ def __init__( export_config: DictConfig, trainer_config: DictConfig, ): + if not HAVE_AMMO: + raise RuntimeError("nvidia-ammo>=0.7 is needed to use Quantizer") from HAVE_AMMO_ERROR + QUANT_CFG_CHOICES = { + "int8": atq.INT8_DEFAULT_CFG, + "int8_sq": atq.INT8_SMOOTHQUANT_CFG, + "fp8": atq.FP8_DEFAULT_CFG, + "int4_awq": atq.INT4_AWQ_CFG, + "w4a8_awq": atq.W4A8_AWQ_BETA_CFG, + } + SUPPORTED_DTYPE = [16, "16", "bf16"] # Default precision for non-quantized layers assert export_config.dtype in SUPPORTED_DTYPE assert quantization_config.algorithm in QUANT_CFG_CHOICES self.quantization_config = quantization_config diff --git a/reinstall.sh b/reinstall.sh index a5004590c7c1..d64b56103dd3 100755 --- a/reinstall.sh +++ b/reinstall.sh @@ -34,7 +34,7 @@ else ${PIP} install build pytest-runner python -m build --no-isolation --wheel DIST_FILE=$(find ./dist -name "*.whl" | head -n 1) - ${PIP} install --extra-index-url https://pypi.nvidia.com "${DIST_FILE}[all]" + ${PIP} install "${DIST_FILE}[all]" fi echo 'All done!' diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 984a3aa45478..2484328293e1 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -12,7 +12,6 @@ markdown2 matplotlib>=3.3.2 megatron_core==0.5.0 nltk>=3.6.5 -nvidia-ammo~=0.7.0 opencc<1.1.7 pangu rapidfuzz From b56ff60381b80d0add4456297dab0fb52b30cf1e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Mar 2024 14:31:47 +0000 Subject: [PATCH 24/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/export/quantize/quantizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 962529661d4e..3114416be18b 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -33,6 +33,7 @@ try: import ammo.torch.quantization as atq from ammo.torch.export import export_model_config + HAVE_AMMO = True except (ImportError, ModuleNotFoundError) as e: From 01f215d6b6c10a2d0f7267e89816ba148f12bb88 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 12 Mar 2024 09:49:07 +0100 Subject: [PATCH 25/30] Add Llama2 AWQ test and update some paths Signed-off-by: Jan Lasek --- Jenkinsfile | 28 +++++++++++++++++++++------- tests/setup/__main__.py | 6 +++--- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index b67fb4ac6f74..67b70516ed60 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -417,7 +417,7 @@ pipeline { steps { sh 'CUDA_VISIBLE_DEVICES=0 python scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py \ --in-file=/home/TestData/nlp/megatron_llama/llama-ci-hf \ - --out-file=/home/TestData/nlp/megatron_llama/ci.nemo \ + --out-file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ --precision=16' } } @@ -462,27 +462,41 @@ pipeline { stage('Llama2 - INT8 SQ') { steps { sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ - model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ + model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ quantization.algorithm=int8_sq \ quantization.num_calib_size=8 
\ inference.batch_size=2 \ - model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.qnemo' + model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo' } } stage('Llama2 - FP8') { steps { sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ - model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ + model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ tensor_model_parallel_size=2 \ trainer.devices=2 \ quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ quantization.algorithm=fp8 \ quantization.num_calib_size=8 \ inference.batch_size=2 \ - model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.qnemo' + model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci_fp8.qnemo' + } + } + stage('Llama2 - AWQ') { + steps { + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ + tensor_model_parallel_size=2 \ + trainer.devices=2 \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=int4_awq \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo' } } } diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py index 51cdab795a99..289a2537e2f2 100644 --- a/tests/setup/__main__.py +++ b/tests/setup/__main__.py @@ -34,9 +34,9 @@ ) create_hf_model( - model_name_or_path="/home/TestData/nlp/megatron_llama/llama-ci-hf", # FIXME: change to "meta-llama/Llama-2-7b-hf" - output_dir=os.path.join(args.save_dir, "tiny_llama2_hf"), - config_updates={"hidden_size": 128, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}, + model_name_or_path="/home/TestData/nlp/meta-llama/Llama-2-7b-hf", + output_dir=os.path.join(args.save_dir, "megatron_llama/llama-ci-hf"), + config_updates={"hidden_size": 256, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}, overwrite=args.overwrite, ) print("Setup done.") From fe1eeba84af7c4ac6f2be9131c91401c907a1216 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 12 Mar 2024 13:08:12 +0100 Subject: [PATCH 26/30] Enable specifying quantization.algorithm=null for baseline accuracy checks Signed-off-by: Jan Lasek --- Jenkinsfile | 11 +++++++- .../conf/megatron_llama_quantization.yaml | 2 +- .../megatron_llama_quantization.py | 19 ++++++++----- nemo/export/quantize/quantizer.py | 28 +++++++++++++------ 4 files changed, 42 insertions(+), 18 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 67b70516ed60..15b04a34b12c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -459,6 +459,15 @@ pipeline { } failFast true parallel { + stage('Llama2 - Export Only') { + steps { + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ + quantization.algorithm=null \ + model_save=/home/TestData/nlp/megatron_llama/ci_baseline.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci_baseline.qnemo' + } + } stage('Llama2 - INT8 SQ') { steps { sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ @@ -485,7 +494,7 @@ pipeline { sh 'rm -f /home/TestData/nlp/megatron_llama/ci_fp8.qnemo' } } - 
stage('Llama2 - AWQ') { + stage('Llama2 - INT4 AWQ') { steps { sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ diff --git a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml index 5a5e87eba7c4..322ab946febe 100644 --- a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml +++ b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml @@ -22,7 +22,7 @@ trainer: quantization: quantize_bmm1: false - algorithm: fp8 # int8_sq, fp8, int8, int4_awq + algorithm: fp8 # int8_sq, fp8, int8, int4_awq, null calib_dataset: cnn_dailymail # pileval, wikitext, cnn_dailymail num_calib_size: 512 # number of samples used for calibration diff --git a/examples/nlp/language_modeling/megatron_llama_quantization.py b/examples/nlp/language_modeling/megatron_llama_quantization.py index 7b6fe88ba4f6..16fb5ae9c13b 100644 --- a/examples/nlp/language_modeling/megatron_llama_quantization.py +++ b/examples/nlp/language_modeling/megatron_llama_quantization.py @@ -69,13 +69,18 @@ def main(cfg) -> None: quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.export, cfg.trainer) - dataloader = get_calib_dataloader( - cfg.quantization.calib_dataset, - cfg.inference.batch_size, - cfg.quantization.num_calib_size, - cfg.inference.max_context_length, - ) - dataloader = [data for data in dataloader] + # Quantization algorithm can be set to None. This is useful for baseline precision + # accuracy validation. In this case only weights export step will be performed: + if cfg.quantization.algorithm is not None: + dataloader = get_calib_dataloader( + cfg.quantization.calib_dataset, + cfg.inference.batch_size, + cfg.quantization.num_calib_size, + cfg.inference.max_context_length, + ) + dataloader = [data for data in dataloader] + else: + dataloader = None model = quantizer.quantize( cfg.model_file, dataloader, cfg.tensor_model_parallel_size, cfg.pipeline_model_parallel_size diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 3114416be18b..591848fd0adf 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -65,6 +65,9 @@ class Quantizer: Please consult AMMO documentation for details. You can also inspect different choices in examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml for quantization algorithms and calibration data as well as recommended settings. + + Quantization algorithm can also be conveniently set to 'null' to perform only weights export step + for TensorRT-LLM deployment. This is useful to getting baseline results for a full-precision model. 
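Concretely, the baseline path described above reduces to the following sketch (a hedged illustration using the config names from this series; only the algorithm override differs from the quantized case):

    # Baseline, non-quantized export: with algorithm set to None, quantize()
    # only loads the model, and export() still writes the weights, model
    # artifacts and tokenizer config for a TensorRT-LLM engine build.
    cfg.quantization.algorithm = None
    quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.export, cfg.trainer)
    model = quantizer.quantize(cfg.model_file, None, cfg.tensor_model_parallel_size,
                               cfg.pipeline_model_parallel_size)
    quantizer.export(model, cfg.model_save)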
""" def __init__( @@ -85,17 +88,20 @@ def __init__( } SUPPORTED_DTYPE = [16, "16", "bf16"] # Default precision for non-quantized layers assert export_config.dtype in SUPPORTED_DTYPE - assert quantization_config.algorithm in QUANT_CFG_CHOICES + assert quantization_config.algorithm is None or quantization_config.algorithm in QUANT_CFG_CHOICES self.quantization_config = quantization_config self.inference_config = inference_config self.export_config = export_config self.trainer_config = trainer_config - atq_config = QUANT_CFG_CHOICES[quantization_config.algorithm] - if quantization_config.algorithm != "fp8": - # disable quantization for the last output layer - atq_config = copy.deepcopy(atq_config) - atq_config["quant_cfg"]["*.output_layer.*"] = {"enable": False} - self.atq_config = atq_config + if quantization_config.algorithm is not None: + atq_config = QUANT_CFG_CHOICES[quantization_config.algorithm] + if quantization_config.algorithm != "fp8": + # disable quantization for the last output layer + atq_config = copy.deepcopy(atq_config) + atq_config["quant_cfg"]["*.output_layer.*"] = {"enable": False} + self.atq_config = atq_config + else: + self.atq_config = None def _load_model( self, @@ -160,12 +166,16 @@ def _load_and_modify_config( def quantize( self, model_file: str, - dataloader: List[List[str]], + dataloader: Optional[List[List[str]]], tensor_model_parallel_size: Optional[int] = None, pipeline_model_parallel_size: Optional[int] = None, ): """Quantize model checkpoint using given dataloader and optional custom parallelism settings.""" model = self._load_model(model_file, tensor_model_parallel_size, pipeline_model_parallel_size) + + if self.quantization_config.algorithm is None: + return model + model.set_inference_config(OmegaConf.to_container(self.inference_config)) def forward_loop(): @@ -174,7 +184,7 @@ def forward_loop(): print(f"Calibrating batch {i}") model.predict_step(batch, i) - atq.quantize(model, self.atq_config, forward_loop) + model = atq.quantize(model, self.atq_config, forward_loop) return model def export(self, model, model_save: str): From 3a7f07ec5f8b8877f7103075b4c4e42a760ce7fa Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 12 Mar 2024 14:39:34 +0100 Subject: [PATCH 27/30] Enable exporting qnemo tarball or just to a directory Signed-off-by: Jan Lasek --- Jenkinsfile | 4 ++-- nemo/export/quantize/quantizer.py | 28 +++++++++++++++++++--------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 15b04a34b12c..46b8f30be703 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -464,8 +464,8 @@ pipeline { sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ quantization.algorithm=null \ - model_save=/home/TestData/nlp/megatron_llama/ci_baseline.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci_baseline.qnemo' + model_save=/home/TestData/nlp/megatron_llama/ci_baseline' + sh 'rm -rf /home/TestData/nlp/megatron_llama/ci_baseline' } } stage('Llama2 - INT8 SQ') { diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 591848fd0adf..c25536208874 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -14,6 +14,7 @@ import copy import tarfile +from contextlib import nullcontext from typing import List, Optional import torch.distributed as dist @@ -51,11 +52,11 @@ class Quantizer: 1. Loading a Nemo model from disk using appropriate parallelism strategy 2. 
Calibrating the model to obtain appropriate algorithm-specific scaling factors - 3. Producing .qnemo tarball with model config (JSON), quantized weights (safetensors) - and tokenizer config (yaml). + 3. Producing output directory or .qnemo tarball with model config (json), + quantized weights (safetensors) and tokenizer config (yaml). - The .qnemo file produced is intended consumed by TensorRT-LLM toolbox for inference. - This can be achieved using Nemo inference containers. + The output directory (or .qnemo file) produced is intended to be consumed by TensorRT-LLM toolbox + for efficient inference. This can be achieved using Nemo inference containers. Currently supported and tested model family is Llama2. Model type needs to be specified in the quantization command with decoder_type parameter on exporting (see below). Quantizing other @@ -191,17 +192,26 @@ def export(self, model, model_save: str): """Export model to '.qnemo' format for TensorRT-LLM engine build.""" torch_dtype = torch_dtype_from_precision(self.export_config.dtype) - with temporary_directory() as tmp_dir: + # Setup model export handling: temporary directory for + # '.qnemo' tarball or directly write to model_save + save_qnemo = model_save.endswith(".qnemo") + if save_qnemo: + export_handler = temporary_directory() + else: + export_handler = nullcontext(enter_result=model_save) + + with export_handler as export_dir: export_model_config( model=model, decoder_type=self.export_config.decoder_type, dtype=torch_dtype, - export_dir=tmp_dir, + export_dir=export_dir, inference_tensor_parallel=self.export_config.inference_tensor_parallel, ) dist.barrier() # Wait until all ranks complete export_model_config step if is_global_rank_zero(): logging.info(f"Exporting quantized weights, model artifacts, and tokenizer config to {model_save}...") - with tarfile.open(model_save, "w:gz") as tar: - save_artifacts(model, tmp_dir) - tar.add(tmp_dir, arcname="./") + save_artifacts(model, export_dir) + if save_qnemo: + with tarfile.open(model_save, "w:gz") as tar: + tar.add(export_dir, arcname="./") From ac52816ea6a7de0308277f988c050fdb6f0415c7 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 12 Mar 2024 21:51:55 +0100 Subject: [PATCH 28/30] Drop AWQ testing for now Signed-off-by: Jan Lasek --- Jenkinsfile | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 46b8f30be703..37e0a229bf74 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -494,20 +494,6 @@ pipeline { sh 'rm -f /home/TestData/nlp/megatron_llama/ci_fp8.qnemo' } } - stage('Llama2 - INT4 AWQ') { - steps { - sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ - model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ - tensor_model_parallel_size=2 \ - trainer.devices=2 \ - quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - quantization.algorithm=int4_awq \ - quantization.num_calib_size=8 \ - inference.batch_size=2 \ - model_save=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo' - } - } } } From 81e8e0769a17158a2e5f43ea8cc05c20db8a2bd4 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 12 Mar 2024 21:52:45 +0100 Subject: [PATCH 29/30] Test case for export.inference_tensor_parallel=2 Signed-off-by: Jan Lasek --- Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Jenkinsfile b/Jenkinsfile index 37e0a229bf74..d2fae1fc687e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -490,6 +490,7 @@ pipeline { 
quantization.algorithm=fp8 \ quantization.num_calib_size=8 \ inference.batch_size=2 \ + export.inference_tensor_parallel=2 \ model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo' sh 'rm -f /home/TestData/nlp/megatron_llama/ci_fp8.qnemo' } From bf03390b178f4a94494ad977eba423acab329252 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 12 Mar 2024 21:57:41 +0100 Subject: [PATCH 30/30] Flag to export TRT-LLM config.json Signed-off-by: Jan Lasek --- .../nlp/language_modeling/conf/megatron_llama_quantization.yaml | 1 + nemo/export/quantize/quantizer.py | 1 + 2 files changed, 2 insertions(+) diff --git a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml index 322ab946febe..f3803dc4e69c 100644 --- a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml +++ b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml @@ -30,6 +30,7 @@ export: decoder_type: llama # gptnext, gpt2, llama inference_tensor_parallel: 1 # Default using 1 TP for inference dtype: 16 # Default precision data type + export_tensorrt_llm_config: true # export config to build TRT-LLM engine directly model_file: llama2-7b-fp16.nemo # Nemo file path model_save: llama2-7b-fp8.qnemo # Path where the quantized model will be saved diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index c25536208874..1ae375e6cfe7 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -207,6 +207,7 @@ def export(self, model, model_save: str): dtype=torch_dtype, export_dir=export_dir, inference_tensor_parallel=self.export_config.inference_tensor_parallel, + export_tensorrt_llm_config=self.export_config.export_tensorrt_llm_config, ) dist.barrier() # Wait until all ranks complete export_model_config step if is_global_rank_zero():