From e6b0db18fe832eb2ecab65b10a7200f99f6fd2f3 Mon Sep 17 00:00:00 2001
From: Jan Lasek
Date: Fri, 16 Feb 2024 18:15:50 +0100
Subject: [PATCH 01/30] AMMO integration with Llama2 PTQ example and tests

Signed-off-by: Jan Lasek
---
 Dockerfile                                    |   2 +
 .../conf/megatron_llama_quantization.yaml     |  35 ++++
 .../megatron_llama_quantization.py            |  71 ++++++++
 .../language_modeling/megatron/model_specs.py |  41 +++++
 .../language_modeling/megatron_gpt_model.py   |   2 +
 nemo/export/__init__.py                       |   0
 nemo/export/quantize/__init__.py              |   1 +
 nemo/export/quantize/quantizer.py             | 158 ++++++++++++++++++
 nemo/export/quantize/utils_wip.py             |  62 +++++++
 tests/setup/__main__.py                       |  31 ++++
 tests/setup/data/create_sample_jsonl.py       |  44 +++++
 tests/setup/models/create_hf_model.py         |  88 ++++++++++
 12 files changed, 535 insertions(+)
 create mode 100644 examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml
 create mode 100644 examples/nlp/language_modeling/megatron_llama_quantization.py
 create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/model_specs.py
 create mode 100644 nemo/export/__init__.py
 create mode 100644 nemo/export/quantize/__init__.py
 create mode 100644 nemo/export/quantize/quantizer.py
 create mode 100644 nemo/export/quantize/utils_wip.py
 create mode 100644 tests/setup/__main__.py
 create mode 100644 tests/setup/data/create_sample_jsonl.py
 create mode 100644 tests/setup/models/create_hf_model.py

diff --git a/Dockerfile b/Dockerfile
index 90c84ea07627..da3f1100be2f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -132,6 +132,8 @@ RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-chec
 RUN pip install flash-attn
 # install numba for latest containers
 RUN pip install numba>=0.57.1
+# install AMMO # TODO: add to requirements
+RUN pip install nvidia-ammo==0.7.2 --extra-index-url https://pypi.nvidia.com --no-cache-dir
 
 # copy nemo source into a scratch image
 FROM scratch as nemo-src
diff --git a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml
new file mode 100644
index 000000000000..5603aa9c92ba
--- /dev/null
+++ b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml
@@ -0,0 +1,35 @@
+inference:
+  greedy: false # Whether or not to use sampling; use greedy decoding otherwise
+  top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k filtering.
+  top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+  temperature: 1.0 # sampling temperature
+  add_BOS: true # add the bos token at the beginning of the prompt
+  tokens_to_generate: 30 # The maximum number of tokens to generate.
+  all_probs: false # whether to return the log prob for all the tokens in the vocab
+  repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
+  min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
+  compute_logprob: false # a flag used to compute logprob of all the input text, a very special case of running inference, default False
+  batch_size: 4 # batch size for inference
+  max_context_length: 512 # max length of the context; the input sequence will be truncated if it is longer than this
+
+trainer:
+  devices: 1
+  num_nodes: 1
+  accelerator: gpu
+  logger: false # logger provided by exp_manager
+  precision: bf16 # 16, 32, or bf16
+  enable_checkpointing: false
+
+quantization:
+  quantize_bmm1: false
+  algorithm: fp8 # int8_sq, fp8, int8, int4_awq
+  calib_dataset: cnn_dailymail # pileval, wikitext, cnn_dailymail
+  num_calib_size: 128 # number of samples used for calibration
+
+tensor_model_parallel_size: 1
+pipeline_model_parallel_size: 1
+decoder_type: llama # gptnext, llama
+model_file: llama2-7b-fp16.nemo # nemo file path
+model_save_path: llama2-7b-fp16.qnemo # Path where the quantized model will be saved
+inference_tensor_parallel: 1 # Default using 1 TP for inference
+dtype: 16 # Default precision data type
diff --git a/examples/nlp/language_modeling/megatron_llama_quantization.py b/examples/nlp/language_modeling/megatron_llama_quantization.py
new file mode 100644
index 000000000000..fcb8fa0e86d5
--- /dev/null
+++ b/examples/nlp/language_modeling/megatron_llama_quantization.py
@@ -0,0 +1,71 @@
+import torch
+import torch.multiprocessing as mp
+from datasets import load_dataset
+
+from nemo.core.config import hydra_runner
+from nemo.export.quantize import Quantizer
+
+mp.set_start_method("spawn", force=True)
+
+"""
+Nemo quantization example script.
+
+Please consult the nemo.export.quantize.Quantizer class
+and the examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml config for available quantization
+methods and supported models, as well as how to set up data and inference for calibration (with defaults recommended).
+
+Example usage:
+```
+python examples/nlp/language_modeling/megatron_llama_quantization.py \
+    model_file=llama2-7b-fp16.nemo \
+    decoder_type=llama \
+    quantization.algorithm=int8_sq \
+    model_save_path=llama2-7b-fp16.qnemo
+```
+"""
+
+
+def get_calib_dataloader(data="cnn_dailymail", batch_size=4, calib_size=512, max_sequence_length=512):
+    if data == "pileval":
+        dataset = load_dataset("json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train")
+        text_column = "text"
+    elif data == "wikitext":
+        dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
+        text_column = "text"
+    elif data == "cnn_dailymail":
+        dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
+        text_column = "article"
+    else:
+        # Assume a local JSON dataset with a column named "text"
+        dataset = load_dataset("json", data_files=data, split="train")
+        text_column = "text"
+    calib_size = max(min(len(dataset), calib_size), batch_size)
+    for i in range(calib_size // batch_size):
+        batch = dataset[i * batch_size : (i + 1) * batch_size][text_column]
+        for j in range(len(batch)):
+            batch[j] = batch[j][:max_sequence_length]
+        yield batch
+
+
+@hydra_runner(config_path="conf", config_name="megatron_llama_quantization")
+def main(cfg) -> None:
+    if not torch.cuda.is_available():
+        raise EnvironmentError("GPU is required for the inference.")
+
+    quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.trainer)
+
+    dataloader = get_calib_dataloader(
+        cfg.quantization.calib_dataset,
+        cfg.inference.batch_size,
+        cfg.quantization.num_calib_size,
+        cfg.inference.max_context_length,
+    )
+    dataloader = [data for data in dataloader]
+
+    model = quantizer.quantize(cfg.model_file, dataloader, cfg.tensor_model_parallel_size)
+
+    quantizer.export(model, cfg.model_save_path, cfg.decoder_type, cfg.dtype, cfg.inference_tensor_parallel)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py
new file mode 100644
index 000000000000..8b9b6868b32c
--- /dev/null
+++ b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py
@@ -0,0 +1,41 @@
+# TODO: This will be a part of MCore
+from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
+from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
+from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
+from megatron.core.transformer.custom_layers.transformer_engine import TENorm
+from megatron.core.transformer.dot_product_attention import DotProductAttention
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.mlp import MLP, MLPSubmodules
+from megatron.core.transformer.spec_utils import ModuleSpec
+from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
+
+
+def get_gpt_layer_ammo_spec() -> ModuleSpec:
+    return ModuleSpec(
+        module=TransformerLayer,
+        submodules=TransformerLayerSubmodules(
+            input_layernorm=TENorm,
+            self_attention=ModuleSpec(
+                module=SelfAttention,
+                params={"attn_mask_type": AttnMaskType.causal},
+                submodules=SelfAttentionSubmodules(
+                    linear_qkv=ColumnParallelLinear,
+                    core_attention=DotProductAttention,
+                    linear_proj=RowParallelLinear,
+                ),
+            ),
+            self_attn_bda=get_bias_dropout_add,
+            pre_mlp_layernorm=TENorm,
+            mlp=ModuleSpec(
+                module=MLP,
+                submodules=MLPSubmodules(
+                    linear_fc1=ColumnParallelLinear,
+                    linear_fc2=RowParallelLinear,
+                ),
+            ),
+            mlp_bda=get_bias_dropout_add,
+            sharded_state_dict_keys_map={
+                'input_layernorm.': 'self_attention.linear_qkv.layer_norm_',
+                'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_',
+            },
+        ),
+    )
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 5cd4ccf380eb..8c213b0d04f6 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -43,6 +43,7 @@ from nemo.collections.nlp.models.language_modeling.megatron.gpt_full_te_layer_autocast_spec import (
     get_gpt_full_te_layer_autocast_spec,
 )
+from nemo.collections.nlp.models.language_modeling.megatron.model_specs import get_gpt_layer_ammo_spec
 from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel
 from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel
 from nemo.collections.nlp.modules.common.megatron.build_model import build_model
@@ -139,6 +140,7 @@ def get_specs(spec_name, num_experts=None):
         "": get_gpt_layer_with_transformer_engine_spec(num_experts),
         "megatron_falcon_gpt": get_falcon_layer_spec(),
         "megatron_gpt_full_te_layer_autocast": get_gpt_full_te_layer_autocast_spec(),
+        "ammo": get_gpt_layer_ammo_spec(),
     }
     if spec_name not in name_spec_dict:
         raise ValueError(f"Spec name '{spec_name}' is not recognized.")
diff --git a/nemo/export/__init__.py b/nemo/export/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/nemo/export/quantize/__init__.py b/nemo/export/quantize/__init__.py
new file mode 100644
index 000000000000..f89c700da6fe
--- /dev/null
+++ b/nemo/export/quantize/__init__.py
@@ -0,0 +1 @@
+from .quantizer import Quantizer
diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py
new file mode 100644
index 000000000000..e6d0b997d382
--- /dev/null
+++ b/nemo/export/quantize/quantizer.py
@@ -0,0 +1,158 @@
+import copy
+import os
+import tarfile
+from typing import Optional
+
+import ammo.torch.quantization as atq
+import torch.distributed as dist
+from ammo.torch.export import export_model_config
+from ammo.torch.utils import print_rank_0
+from megatron.core import parallel_state
+from omegaconf import OmegaConf
+from omegaconf.omegaconf import DictConfig, open_dict
+from pytorch_lightning.trainer.trainer import Trainer
+
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector
+from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision
+from nemo.utils import logging
+from nemo.utils.get_rank import is_global_rank_zero
+
+from .utils_wip import copy_artifacts, temporary_directory  # TODO: Find a good place for these utils
+
+QUANT_CFG_CHOICES = {
+    "int8": atq.INT8_DEFAULT_CFG,
+    "int8_sq": atq.INT8_SMOOTHQUANT_CFG,
+    "fp8": atq.FP8_DEFAULT_CFG,
+    "int4_awq": atq.INT4_AWQ_CFG,
+    "w4a8_awq": atq.W4A8_AWQ_BETA_CFG,
+}
+
+
+class Quantizer:
+
+    """
+    Post-training quantization of Nemo checkpoints.
+
+    PTQ converts selected model layers to low-precision format (e.g., INT4, FP8) for efficient serving.
+    The process consists of several steps:
+
+    1. Loading a Nemo model from disk using an appropriate parallelism strategy
+    2. Calibrating the model to obtain appropriate algorithm-specific scaling factors
+    3. Producing a .qnemo tarball with the model config (JSON), quantized weights (safetensors)
+       and tokenizer config (YAML).
+
+    The produced .qnemo file is intended to be consumed by the TensorRT-LLM toolbox for inference.
+    This can be achieved using Nemo inference containers.
+
+    The currently supported and tested model family is Llama2. The model type needs to be specified
+    with the decoder_type parameter on exporting (see below). Quantizing other model families is
+    experimental and might not be fully supported.
+
+    Available quantization methods are listed in the QUANT_CFG_CHOICES dictionary at the top of this file.
+    Please consult the AMMO documentation for details. You can also inspect the choices of quantization
+    algorithms and calibration data, as well as recommended settings, in
+    examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml.
+    """
+
+    def __init__(self, quantization_config: DictConfig, inference_config: DictConfig, trainer_config: DictConfig):
+        self.quantization_config = quantization_config
+        self.inference_config = inference_config
+        self.trainer_config = trainer_config
+        assert self.quantization_config.algorithm in QUANT_CFG_CHOICES
+        atq_config = QUANT_CFG_CHOICES[quantization_config.algorithm]
+        if quantization_config.algorithm != "fp8":
+            # disable quantization for the last output layer
+            atq_config = copy.deepcopy(atq_config)
+            atq_config["quant_cfg"]["*.output_layer.*"] = {"enable": False}
+        self.atq_config = atq_config
+
+    def _load_model(self, model_file, tensor_model_parallel_size: Optional[int] = None):
+        trainer = Trainer(strategy=NLPDDPStrategy(), **self.trainer_config)
+        connector = NLPSaveRestoreConnector()
+
+        if os.path.isdir(model_file):
+            connector.model_extracted_dir = model_file
+
+        model_cfg = self._restore_and_modify_config(model_file, trainer, connector, tensor_model_parallel_size)
+
+        model = MegatronGPTModel.restore_from(
+            restore_path=model_file,
+            trainer=trainer,
+            override_config_path=model_cfg,
+            save_restore_connector=connector,
+        )
+        model.freeze()
+
+        try:
+            model.model.module.language_model.encoder.activations_checkpoint_method = None
+        except AttributeError:
+            pass
+        print_rank_0(model)
+        self._check_ddp_initialized(model)
+        return model
+
+    def _check_ddp_initialized(self, model):
+        if parallel_state.is_unitialized():
+
+            def dummy():
+                return
+
+            if model.trainer.strategy.launcher is not None:
+                model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer)
+            model.trainer.strategy.setup_environment()
+
+    def _restore_and_modify_config(
+        self,
+        model_file: str,
+        trainer: Trainer,
+        connector: NLPSaveRestoreConnector,
+        tensor_model_parallel_size: Optional[int] = None,
+    ):
+        model_cfg = MegatronGPTModel.restore_from(
+            restore_path=model_file,
+            trainer=trainer,
+            save_restore_connector=connector,
+            return_config=True,
+        )
+        with open_dict(model_cfg):
+            model_cfg.activations_checkpoint_method = None
+            model_cfg.activations_checkpoint_granularity = None
+            if tensor_model_parallel_size is not None:
+                model_cfg.tensor_model_parallel_size = tensor_model_parallel_size
+            model_cfg.name = "ammo"  # Model needs to be loaded in the "ammo" layer spec
+
+        return model_cfg
+
+    def quantize(self, model_file: str, dataloader, tensor_model_parallel_size: Optional[int] = None):
+        model = self._load_model(model_file, tensor_model_parallel_size)
+        model.set_inference_config(OmegaConf.to_container(self.inference_config))
+
+        def forward_loop():
+            for i, batch in enumerate(dataloader):
+                print_rank_0(f"Calibrating batch {i}")
model.predict_step(batch, i) + + atq.quantize(model, self.atq_config, forward_loop) + return model + + def export(self, model, output_file: str, decoder_type: str, dtype: str, inference_tensor_parallel: int): + supported_dtype = [16, "16", "bf16"] # FIXME: Move that to top + assert dtype in supported_dtype, f"{dtype} not supported. Supported dtypes are {supported_dtype}" + torch_dtype = torch_dtype_from_precision(dtype) + + with temporary_directory() as tmp_dir: + export_model_config( + model=model, + decoder_type=decoder_type, + dtype=torch_dtype, + export_dir=tmp_dir, + inference_tensor_parallel=inference_tensor_parallel, + ) + dist.barrier() # Wait until all ranks complete export_model_config step + if is_global_rank_zero(): + logging.info(f"Exporting quantized weights, tokenizer config, and model artifacts to {output_file}...") + with tarfile.open(output_file, "w:gz") as tar: + config = copy_artifacts(model, tmp_dir) + OmegaConf.save(config.tokenizer, os.path.join(tmp_dir, "tokenizer_config.yaml")) + tar.add(tmp_dir, arcname="./") diff --git a/nemo/export/quantize/utils_wip.py b/nemo/export/quantize/utils_wip.py new file mode 100644 index 000000000000..58ef8f09769e --- /dev/null +++ b/nemo/export/quantize/utils_wip.py @@ -0,0 +1,62 @@ +import contextlib +import copy +import os +import shutil +import tarfile +import tempfile + +import torch +import torch.distributed as dist +from omegaconf import OmegaConf + +from nemo.utils.app_state import AppState +from nemo.utils.get_rank import get_rank, is_global_rank_zero + + +@contextlib.contextmanager +def temporary_directory(): + """Create a shared temporary directory across ranks in distributed setup. + + This function assumes that the distributed setup has been already + correctly initialized. It is intended to be used only in single-node + setup so that all ranks can access the directory created.""" + + if is_global_rank_zero(): + tmp_dir = [tempfile.TemporaryDirectory()] + else: + tmp_dir = [None] + torch.distributed.broadcast_object_list(tmp_dir) + print(f"[{get_rank()}] tmp_dir={tmp_dir}") # TODO: remove debug print + yield tmp_dir[0].name + # We use barrier below to make sure that rank zero won't exit + # and delete tmp_dir while other ranks may still use it + dist.barrier() + + +def copy_artifacts(model, output_dir: str): + """Copy all model artifacts to a given output directory and return modified config.""" + app_state = AppState() + model_file = app_state.model_restore_path + model_config = copy.deepcopy(model.cfg) + + # Setup model file handling context: directory or tarball + if os.path.isfile(model_file): + model_file_handler = tarfile.open + kwargs = {"name": model_file, "mode": "r:"} + elif os.path.isdir(model_file): + model_file_handler = contextlib.nullcontext + kwargs = {} + else: + raise FileNotFoundError(model_file) + + # Copy or extract artifacts depending on the context + with model_file_handler(**kwargs) as maybe_tar: + for arti_name, arti_item in model.artifacts.items(): + _, arti_file = arti_item.path.split("nemo:") + if maybe_tar is not None: + maybe_tar.extract(f"./{arti_file}", path=output_dir) + else: + shutil.copy(os.path.join(model_file, arti_file), output_dir) + # Update artifact path to basename + OmegaConf.update(model_config, arti_name, os.path.basename(arti_file)) + return model_config diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py new file mode 100644 index 000000000000..5750122eb1be --- /dev/null +++ b/tests/setup/__main__.py @@ -0,0 +1,31 @@ +import argparse +import os + +from 
.data.create_sample_jsonl import create_sample_jsonl +from .models.create_hf_model import create_hf_model + +print("Setup test data and models...") + +parser = argparse.ArgumentParser("Setup test data and models.") +parser.add_argument("--data_dir", required=True, help="Root save directory for data") +parser.add_argument("--model_dir", required=True, help="Root save directory for models") +parser.add_argument("--overwrite", action="store_true", help="Overwrite existing files and directories") +args = parser.parse_args() + +print(f"Arguments are: {vars(args)}") + +os.makedirs(args.data_dir, exist_ok=True) +os.makedirs(args.model_dir, exist_ok=True) + +create_sample_jsonl( + os.path.join(args.data_dir, "test_quantization", "test.json"), + args.overwrite, +) + +create_hf_model( + "meta-llama/Llama-2-7b-hf", + os.path.join(args.model_dir, "tiny_llama2_hf"), + {"hidden_size": 128, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}, + args.overwrite, +) +print("Setup done.") diff --git a/tests/setup/data/create_sample_jsonl.py b/tests/setup/data/create_sample_jsonl.py new file mode 100644 index 000000000000..ee9bd3b48f7e --- /dev/null +++ b/tests/setup/data/create_sample_jsonl.py @@ -0,0 +1,44 @@ +import argparse +import json +import os + +""" +Create sample JSONL file for functional testing. Each line contains a dictionary +with a single element "text" for storing data. +""" + + +def create_sample_jsonl(output_file: str, overwrite: bool = False): + """Create sample JSONL.""" + if os.path.isfile(output_file) and not overwrite: + print(f"File {output_file} exists and overwrite flag is not set so exiting.") + return + + texts = [ + "Sample data for functional tests", + "Once upon a time, in the middle of a dense forest, there was a small house, where lived a pretty little girl " + "named Little Red Riding Hood.", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore " + "magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea " + "commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat " + "nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit " + "anim id est laborum...", + "Next please!", + "¡H E L L O W O R L D!", + "Korzystając z okazji chciałbym pozdrowić całą moją rodzinę i przyjaciół", + ] + print(f"Writing {len(texts)} line(s) to {output_file}...") + os.makedirs(os.path.dirname(output_file), exist_ok=True) + with open(output_file, mode="w", encoding="utf-8") as f: + for text in texts: + json.dump({"text": text}, f) + f.write("\n") + print("OK.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Create sample JSONL file.") + parser.add_argument("--output_file", help="Output file name") + parser.add_argument("--overwrite", action="store_true", help="Overwrite file if it exists") + args = parser.parse_args() + create_sample_jsonl(args.output_file) diff --git a/tests/setup/models/create_hf_model.py b/tests/setup/models/create_hf_model.py new file mode 100644 index 000000000000..5d40d9742628 --- /dev/null +++ b/tests/setup/models/create_hf_model.py @@ -0,0 +1,88 @@ +import argparse +import json +import os + +from typing import Any, Dict, Optional + +import transformers + +""" +Create a randomly initialized HuggingFace model for testing purposes. 
+ +Model can be specified by name or path for creating its config and tokenizer using +HuggingFace transformers AutoConfig and AutoTokenizer functions. + +Parameter config_updates can be used to override specific model config fields to make +it smaller, for example, by changing number of layers or hidden layers dimensionality, +making it adequate for testing purposes. This parameter should be specified as +a dictionary that can be parsed using json.loads method. + +Example usage for Llama2 model (requires HF login): +``` +python tests/setup/models/create_tiny_hf_model.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --output_dir tiny_llama2_hf \ + --config_updates '{"hidden_size": 128, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}' +``` +""" + + +def get_hf_model_class(hf_config): + """Get HuggingFace model class from config.""" + if len(hf_config.architectures) > 1: + print(f"More than one model architecture available, choosing 1st: {hf_config.architectures}") + model_name = hf_config.architectures[0] + model_class = getattr(transformers, model_name) + return model_class + + +def create_hf_model( + model_name_or_path: str, output_dir: str, config_updates: Optional[Dict[str, Any]] = None, overwrite: bool = False +): + """Create HuggingFace model with optional config updates.""" + if os.path.isdir(output_dir) and not overwrite: + print(f"Output directory {output_dir} exists and overwrite flag is not set so exiting.") + return + + hf_config = transformers.AutoConfig.from_pretrained(model_name_or_path) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path) + model_class = get_hf_model_class(hf_config) + + if config_updates is not None: + hf_config.update(config_updates) + print(hf_config) + + model = model_class(hf_config) + print(model) + + os.makedirs(output_dir, exist_ok=True) + print(f"Saving model to {output_dir}...") + tokenizer.save_pretrained(output_dir) + model.save_pretrained(output_dir) + print("OK.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Create a HuggingFace model (random initialization) for testing purposes.") + parser.add_argument( + "--model_name_or_path", + required=True, + help="Model name or local path with model config and tokenizer", + ) + parser.add_argument( + "--output_dir", + required=True, + help="Output directory", + ) + parser.add_argument( + "--config_updates", + type=json.loads, + help="Parameter updates in JSON format to overwrite for model config", + ) + parser.add_argument( + "--overwrite", + action="store_true", + help="Overwrite file if it exists", + ) + args = parser.parse_args() + create_hf_model(args.model_name_or_path, args.output_dir, args.config_updates) From 41b3f6d24870adfb638c787bcda2658ce11f23fb Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Fri, 16 Feb 2024 19:09:58 +0100 Subject: [PATCH 02/30] Jenkins megatron_llama_quantization.py test setup Signed-off-by: Jan Lasek --- Jenkinsfile | 37 ++++++++++++++++++++++++++++++++++++- tests/setup/__main__.py | 18 ++++++++---------- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 0625f469ce11..8ab088482491 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -96,6 +96,13 @@ pipeline { } } + // TODO: AMMO installation - move to requirements + stage('AMMO installation') { + steps { + sh 'pip install nvidia-ammo==0.7.2 --extra-index-url https://pypi.nvidia.com --no-cache-dir' + } + } + stage('PyTorch Lightning version') { steps { sh 'python -c "import pytorch_lightning; 
print(pytorch_lightning.__version__)"' @@ -390,6 +397,12 @@ pipeline { } } + stage('Setup test data and models') { + steps { + sh 'python -m tests.setup --save_dir /home/TestData/nlp' + } + } + // TODO: this requires TE >= v0.11 which is not available in 23.06. // please uncomment this test once mcore CI is ready. stage('L2: Community LLM Checkpoints tests') { @@ -407,7 +420,6 @@ pipeline { --in-file=/home/TestData/nlp/megatron_llama/llama-ci-hf \ --out-file=/home/TestData/nlp/megatron_llama/ci.nemo \ --precision=16' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' } } stage('StarCoder') { @@ -439,6 +451,29 @@ pipeline { } } + stage('L2: Nemo PTQ') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('Llama') { + steps { + sh 'CUDA_VISIBLE_DEVICES=0 python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=int8_sq \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + model_save_path=/home/TestData/nlp/megatron_llama/ci.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' + } + } + } + } stage('L2: ASR dev run') { when { anyOf { diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py index 5750122eb1be..707579e24350 100644 --- a/tests/setup/__main__.py +++ b/tests/setup/__main__.py @@ -7,25 +7,23 @@ print("Setup test data and models...") parser = argparse.ArgumentParser("Setup test data and models.") -parser.add_argument("--data_dir", required=True, help="Root save directory for data") -parser.add_argument("--model_dir", required=True, help="Root save directory for models") +parser.add_argument("--save_dir", required=True, help="Root save directory for artifacts") parser.add_argument("--overwrite", action="store_true", help="Overwrite existing files and directories") args = parser.parse_args() print(f"Arguments are: {vars(args)}") -os.makedirs(args.data_dir, exist_ok=True) -os.makedirs(args.model_dir, exist_ok=True) +os.makedirs(args.save_dir, exist_ok=True) create_sample_jsonl( - os.path.join(args.data_dir, "test_quantization", "test.json"), - args.overwrite, + output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), + overwrite=args.overwrite, ) create_hf_model( - "meta-llama/Llama-2-7b-hf", - os.path.join(args.model_dir, "tiny_llama2_hf"), - {"hidden_size": 128, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}, - args.overwrite, + model_name_or_path="/home/TestData/nlp/megatron_llama/llama-ci-hf", # FIXME: change to "meta-llama/Llama-2-7b-hf" + output_dir=os.path.join(args.save_dir, "tiny_llama2_hf"), + config_updates={"hidden_size": 128, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}, + overwrite=args.overwrite, ) print("Setup done.") From 71d952996812512467da42073409efa86e33adfa Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 Feb 2024 18:35:51 +0000 Subject: [PATCH 03/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language_modeling/megatron/model_specs.py | 9 ++------- .../language_modeling/megatron_gpt_model.py | 1 + nemo/export/quantize/quantizer.py | 10 ++-------- tests/setup/__main__.py | 3 +-- tests/setup/models/create_hf_model.py | 16 ++++------------ 5 files changed, 10 insertions(+), 29 deletions(-) diff 
--git a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py index 8b9b6868b32c..9f0b3ac5ca74 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py @@ -19,18 +19,13 @@ def get_gpt_layer_ammo_spec() -> ModuleSpec: module=SelfAttention, params={"attn_mask_type": AttnMaskType.causal}, submodules=SelfAttentionSubmodules( - linear_qkv=ColumnParallelLinear, - core_attention=DotProductAttention, - linear_proj=RowParallelLinear, + linear_qkv=ColumnParallelLinear, core_attention=DotProductAttention, linear_proj=RowParallelLinear, ), ), self_attn_bda=get_bias_dropout_add, pre_mlp_layernorm=TENorm, mlp=ModuleSpec( - module=MLP, - submodules=MLPSubmodules( - linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear, - ), + module=MLP, submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,), ), mlp_bda=get_bias_dropout_add, sharded_state_dict_keys_map={ diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8c213b0d04f6..d5024495889e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -45,6 +45,7 @@ ) from nemo.collections.nlp.models.language_modeling.megatron.model_specs import get_gpt_layer_ammo_spec from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel +from nemo.collections.nlp.models.language_modeling.megatron.model_specs import get_gpt_layer_ammo_spec from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel from nemo.collections.nlp.modules.common.megatron.build_model import build_model from nemo.collections.nlp.modules.common.megatron.module import Float16Module diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index e6d0b997d382..cfc89db5f05c 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -77,10 +77,7 @@ def _load_model(self, model_file, tensor_model_parallel_size: Optional[int] = No model_cfg = self._restore_and_modify_config(model_file, trainer, connector, tensor_model_parallel_size) model = MegatronGPTModel.restore_from( - restore_path=model_file, - trainer=trainer, - override_config_path=model_cfg, - save_restore_connector=connector, + restore_path=model_file, trainer=trainer, override_config_path=model_cfg, save_restore_connector=connector, ) model.freeze() @@ -110,10 +107,7 @@ def _restore_and_modify_config( tensor_model_parallel_size: Optional[int] = None, ): model_cfg = MegatronGPTModel.restore_from( - restore_path=model_file, - trainer=trainer, - save_restore_connector=connector, - return_config=True, + restore_path=model_file, trainer=trainer, save_restore_connector=connector, return_config=True, ) with open_dict(model_cfg): model_cfg.activations_checkpoint_method = None diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py index 707579e24350..8773a8b7fa55 100644 --- a/tests/setup/__main__.py +++ b/tests/setup/__main__.py @@ -16,8 +16,7 @@ os.makedirs(args.save_dir, exist_ok=True) create_sample_jsonl( - output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), - overwrite=args.overwrite, + output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), 
overwrite=args.overwrite, ) create_hf_model( diff --git a/tests/setup/models/create_hf_model.py b/tests/setup/models/create_hf_model.py index 5d40d9742628..dd5d98251e64 100644 --- a/tests/setup/models/create_hf_model.py +++ b/tests/setup/models/create_hf_model.py @@ -65,24 +65,16 @@ def create_hf_model( if __name__ == "__main__": parser = argparse.ArgumentParser("Create a HuggingFace model (random initialization) for testing purposes.") parser.add_argument( - "--model_name_or_path", - required=True, - help="Model name or local path with model config and tokenizer", + "--model_name_or_path", required=True, help="Model name or local path with model config and tokenizer", ) parser.add_argument( - "--output_dir", - required=True, - help="Output directory", + "--output_dir", required=True, help="Output directory", ) parser.add_argument( - "--config_updates", - type=json.loads, - help="Parameter updates in JSON format to overwrite for model config", + "--config_updates", type=json.loads, help="Parameter updates in JSON format to overwrite for model config", ) parser.add_argument( - "--overwrite", - action="store_true", - help="Overwrite file if it exists", + "--overwrite", action="store_true", help="Overwrite file if it exists", ) args = parser.parse_args() create_hf_model(args.model_name_or_path, args.output_dir, args.config_updates) From 9c2d7f4996c8098275566008a4d58610f8b06d56 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 19 Feb 2024 10:55:32 +0100 Subject: [PATCH 04/30] License headers Signed-off-by: Jan Lasek --- .../megatron_llama_quantization.py | 14 ++++++++++++++ .../language_modeling/megatron/model_specs.py | 14 ++++++++++++++ nemo/export/__init__.py | 13 +++++++++++++ nemo/export/quantize/__init__.py | 14 ++++++++++++++ nemo/export/quantize/quantizer.py | 14 ++++++++++++++ nemo/export/quantize/utils_wip.py | 14 ++++++++++++++ tests/setup/__main__.py | 14 ++++++++++++++ tests/setup/data/create_sample_jsonl.py | 14 ++++++++++++++ tests/setup/models/create_hf_model.py | 14 ++++++++++++++ 9 files changed, 125 insertions(+) diff --git a/examples/nlp/language_modeling/megatron_llama_quantization.py b/examples/nlp/language_modeling/megatron_llama_quantization.py index fcb8fa0e86d5..5565900f901b 100644 --- a/examples/nlp/language_modeling/megatron_llama_quantization.py +++ b/examples/nlp/language_modeling/megatron_llama_quantization.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch import torch.multiprocessing as mp from datasets import load_dataset diff --git a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py index 9f0b3ac5ca74..d7fb633b3eda 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # TODO: This will be a part of MCore from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear diff --git a/nemo/export/__init__.py b/nemo/export/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/nemo/export/__init__.py +++ b/nemo/export/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/export/quantize/__init__.py b/nemo/export/quantize/__init__.py index f89c700da6fe..87812e621bb6 100644 --- a/nemo/export/quantize/__init__.py +++ b/nemo/export/quantize/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .quantizer import Quantizer diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index cfc89db5f05c..37b635d30d2d 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import copy import os import tarfile diff --git a/nemo/export/quantize/utils_wip.py b/nemo/export/quantize/utils_wip.py index 58ef8f09769e..0406353401ff 100644 --- a/nemo/export/quantize/utils_wip.py +++ b/nemo/export/quantize/utils_wip.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import contextlib import copy import os diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py index 8773a8b7fa55..51cdab795a99 100644 --- a/tests/setup/__main__.py +++ b/tests/setup/__main__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import os diff --git a/tests/setup/data/create_sample_jsonl.py b/tests/setup/data/create_sample_jsonl.py index ee9bd3b48f7e..00f789548f81 100644 --- a/tests/setup/data/create_sample_jsonl.py +++ b/tests/setup/data/create_sample_jsonl.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import json import os diff --git a/tests/setup/models/create_hf_model.py b/tests/setup/models/create_hf_model.py index dd5d98251e64..9f57d5996dfc 100644 --- a/tests/setup/models/create_hf_model.py +++ b/tests/setup/models/create_hf_model.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import argparse import json import os From ae88e4744e43f0c7afc7d6bc8739a42a05a18209 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 19 Feb 2024 10:55:50 +0100 Subject: [PATCH 05/30] Add AMMO to requirements_nlp.txt with --extra-index-url for pip install Signed-off-by: Jan Lasek --- Dockerfile | 4 +--- Jenkinsfile | 7 ------- reinstall.sh | 2 +- requirements/requirements_nlp.txt | 1 + 4 files changed, 3 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index da3f1100be2f..de85b35bf253 100644 --- a/Dockerfile +++ b/Dockerfile @@ -126,14 +126,12 @@ RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_k2.sh); INSTALL WORKDIR /tmp/nemo ENV LHOTSE_REQUIRE_TORCHAUDIO=0 COPY requirements . -RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done +RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --extra-index-url https://pypi.nvidia.com --no-cache-dir -r $f; done # install flash attention RUN pip install flash-attn # install numba for latest containers RUN pip install numba>=0.57.1 -# install AMMO # TODO: add to requirements -RUN pip install nvidia-ammo==0.7.2 --extra-index-url https://pypi.nvidia.com --no-cache-dir # copy nemo source into a scratch image FROM scratch as nemo-src diff --git a/Jenkinsfile b/Jenkinsfile index 8ab088482491..8902151c603f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -96,13 +96,6 @@ pipeline { } } - // TODO: AMMO installation - move to requirements - stage('AMMO installation') { - steps { - sh 'pip install nvidia-ammo==0.7.2 --extra-index-url https://pypi.nvidia.com --no-cache-dir' - } - } - stage('PyTorch Lightning version') { steps { sh 'python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"' diff --git a/reinstall.sh b/reinstall.sh index d64b56103dd3..a5004590c7c1 100755 --- a/reinstall.sh +++ b/reinstall.sh @@ -34,7 +34,7 @@ else ${PIP} install build pytest-runner python -m build --no-isolation --wheel DIST_FILE=$(find ./dist -name "*.whl" | head -n 1) - ${PIP} install "${DIST_FILE}[all]" + ${PIP} install --extra-index-url https://pypi.nvidia.com "${DIST_FILE}[all]" fi echo 'All done!' 
diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 2484328293e1..888e66d194b3 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -12,6 +12,7 @@ markdown2 matplotlib>=3.3.2 megatron_core==0.5.0 nltk>=3.6.5 +nvidia-ammo==0.7.2 opencc<1.1.7 pangu rapidfuzz From 6ca03d4cd8560b20d0eb4fe1b8a3e9433a2665c9 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Thu, 22 Feb 2024 15:22:53 +0100 Subject: [PATCH 06/30] Bump AMMO version to latest Signed-off-by: Jan Lasek --- requirements/requirements_nlp.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 888e66d194b3..4bffca663d71 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -12,7 +12,7 @@ markdown2 matplotlib>=3.3.2 megatron_core==0.5.0 nltk>=3.6.5 -nvidia-ammo==0.7.2 +nvidia-ammo==0.7.3 opencc<1.1.7 pangu rapidfuzz From 5170db5c63c7902f9067114abe63cd63385f6370 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Thu, 22 Feb 2024 15:28:42 +0100 Subject: [PATCH 07/30] Guards workaround on spec definition Signed-off-by: Jan Lasek --- .../language_modeling/megatron/model_specs.py | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py index d7fb633b3eda..006f5d730045 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py @@ -12,19 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# TODO: This will be a part of MCore -from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add -from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear -from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules -from megatron.core.transformer.custom_layers.transformer_engine import TENorm -from megatron.core.transformer.dot_product_attention import DotProductAttention -from megatron.core.transformer.enums import AttnMaskType -from megatron.core.transformer.mlp import MLP, MLPSubmodules -from megatron.core.transformer.spec_utils import ModuleSpec -from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules +# TODO: This spec will be defined in MCore>=0.6.0 and is temporary +try: + + from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add + from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear + from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules + from megatron.core.transformer.custom_layers.transformer_engine import TENorm + from megatron.core.transformer.dot_product_attention import DotProductAttention + from megatron.core.transformer.enums import AttnMaskType + from megatron.core.transformer.mlp import MLP, MLPSubmodules + from megatron.core.transformer.spec_utils import ModuleSpec + from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + ModuleSpec = None def get_gpt_layer_ammo_spec() -> ModuleSpec: + assert HAVE_MEGATRON_CORE return ModuleSpec( module=TransformerLayer, submodules=TransformerLayerSubmodules( From 543dea1fa934fa6310c2fb20439efd9eef7e8b1f Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Thu, 29 Feb 2024 10:56:12 +0100 Subject: [PATCH 08/30] Save artifacts and tokenizer config at once Signed-off-by: Jan Lasek --- nemo/export/quantize/quantizer.py | 5 ++--- nemo/export/quantize/utils_wip.py | 20 ++++++++++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 37b635d30d2d..0e84d6bdad48 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -32,7 +32,7 @@ from nemo.utils import logging from nemo.utils.get_rank import is_global_rank_zero -from .utils_wip import copy_artifacts, temporary_directory # TODO: Find a good place for these utils +from .utils_wip import save_artifacts, temporary_directory # TODO: Find a good place for these utils QUANT_CFG_CHOICES = { "int8": atq.INT8_DEFAULT_CFG, @@ -161,6 +161,5 @@ def export(self, model, output_file: str, decoder_type: str, dtype: str, inferen if is_global_rank_zero(): logging.info(f"Exporting quantized weights, tokenizer config, and model artifacts to {output_file}...") with tarfile.open(output_file, "w:gz") as tar: - config = copy_artifacts(model, tmp_dir) - OmegaConf.save(config.tokenizer, os.path.join(tmp_dir, "tokenizer_config.yaml")) + save_artifacts(model, tmp_dir) tar.add(tmp_dir, arcname="./") diff --git a/nemo/export/quantize/utils_wip.py b/nemo/export/quantize/utils_wip.py index 0406353401ff..74ab5471de1e 100644 --- a/nemo/export/quantize/utils_wip.py +++ b/nemo/export/quantize/utils_wip.py @@ -47,11 +47,11 @@ def temporary_directory(): dist.barrier() -def copy_artifacts(model, output_dir: str): - """Copy all model artifacts to a given 
output directory and return modified config.""" +def save_artifacts(model, output_dir: str, use_abspath: bool = False) -> None: + """Save all model artifacts and tokenizer config to a given output directory.""" app_state = AppState() model_file = app_state.model_restore_path - model_config = copy.deepcopy(model.cfg) + model_cfg = copy.deepcopy(model.cfg) # Setup model file handling context: directory or tarball if os.path.isfile(model_file): @@ -66,11 +66,15 @@ def copy_artifacts(model, output_dir: str): # Copy or extract artifacts depending on the context with model_file_handler(**kwargs) as maybe_tar: for arti_name, arti_item in model.artifacts.items(): - _, arti_file = arti_item.path.split("nemo:") + arti_file = arti_item.path.removeprefix("nemo:") + arti_path = os.path.join(output_dir, arti_name) if maybe_tar is not None: maybe_tar.extract(f"./{arti_file}", path=output_dir) + os.rename(os.path.join(output_dir, arti_file), arti_path) else: - shutil.copy(os.path.join(model_file, arti_file), output_dir) - # Update artifact path to basename - OmegaConf.update(model_config, arti_name, os.path.basename(arti_file)) - return model_config + shutil.copy(os.path.join(model_file, arti_file), arti_path) + # Store artifact path as basename by default. Otherwise save absolute path but bear in mind + # that in this case output directory should be permanent for correct artifact recovery later + arti_path = os.path.abspath(arti_path) if use_abspath else os.path.basename(arti_path) + OmegaConf.update(model_cfg, arti_name, arti_path) + OmegaConf.save(model_cfg.tokenizer, os.path.join(output_dir, "tokenizer_config.yaml")) From 785167fe0d8d8b3f0097de048b9d6325060730f2 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Thu, 29 Feb 2024 11:27:31 +0100 Subject: [PATCH 09/30] Extend nemo.utils package with new tools Signed-off-by: Jan Lasek --- nemo/export/quantize/quantizer.py | 4 +- nemo/export/quantize/utils_wip.py | 80 ------------------------------- nemo/utils/distributed.py | 23 +++++++++ nemo/utils/model_utils.py | 36 ++++++++++++++ 4 files changed, 61 insertions(+), 82 deletions(-) delete mode 100644 nemo/export/quantize/utils_wip.py diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 0e84d6bdad48..4f3ea26513cf 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -30,9 +30,9 @@ from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.utils import logging +from nemo.utils.distributed import temporary_directory from nemo.utils.get_rank import is_global_rank_zero - -from .utils_wip import save_artifacts, temporary_directory # TODO: Find a good place for these utils +from nemo.utils.model_utils import save_artifacts QUANT_CFG_CHOICES = { "int8": atq.INT8_DEFAULT_CFG, diff --git a/nemo/export/quantize/utils_wip.py b/nemo/export/quantize/utils_wip.py deleted file mode 100644 index 74ab5471de1e..000000000000 --- a/nemo/export/quantize/utils_wip.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import contextlib -import copy -import os -import shutil -import tarfile -import tempfile - -import torch -import torch.distributed as dist -from omegaconf import OmegaConf - -from nemo.utils.app_state import AppState -from nemo.utils.get_rank import get_rank, is_global_rank_zero - - -@contextlib.contextmanager -def temporary_directory(): - """Create a shared temporary directory across ranks in distributed setup. - - This function assumes that the distributed setup has been already - correctly initialized. It is intended to be used only in single-node - setup so that all ranks can access the directory created.""" - - if is_global_rank_zero(): - tmp_dir = [tempfile.TemporaryDirectory()] - else: - tmp_dir = [None] - torch.distributed.broadcast_object_list(tmp_dir) - print(f"[{get_rank()}] tmp_dir={tmp_dir}") # TODO: remove debug print - yield tmp_dir[0].name - # We use barrier below to make sure that rank zero won't exit - # and delete tmp_dir while other ranks may still use it - dist.barrier() - - -def save_artifacts(model, output_dir: str, use_abspath: bool = False) -> None: - """Save all model artifacts and tokenizer config to a given output directory.""" - app_state = AppState() - model_file = app_state.model_restore_path - model_cfg = copy.deepcopy(model.cfg) - - # Setup model file handling context: directory or tarball - if os.path.isfile(model_file): - model_file_handler = tarfile.open - kwargs = {"name": model_file, "mode": "r:"} - elif os.path.isdir(model_file): - model_file_handler = contextlib.nullcontext - kwargs = {} - else: - raise FileNotFoundError(model_file) - - # Copy or extract artifacts depending on the context - with model_file_handler(**kwargs) as maybe_tar: - for arti_name, arti_item in model.artifacts.items(): - arti_file = arti_item.path.removeprefix("nemo:") - arti_path = os.path.join(output_dir, arti_name) - if maybe_tar is not None: - maybe_tar.extract(f"./{arti_file}", path=output_dir) - os.rename(os.path.join(output_dir, arti_file), arti_path) - else: - shutil.copy(os.path.join(model_file, arti_file), arti_path) - # Store artifact path as basename by default. Otherwise save absolute path but bear in mind - # that in this case output directory should be permanent for correct artifact recovery later - arti_path = os.path.abspath(arti_path) if use_abspath else os.path.basename(arti_path) - OmegaConf.update(model_cfg, arti_name, arti_path) - OmegaConf.save(model_cfg.tokenizer, os.path.join(output_dir, "tokenizer_config.yaml")) diff --git a/nemo/utils/distributed.py b/nemo/utils/distributed.py index b0d24de3e5b4..ee6c107b1d85 100644 --- a/nemo/utils/distributed.py +++ b/nemo/utils/distributed.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import contextlib import os +import tempfile import torch +import torch.distributed as dist from nemo.utils import logging +from nemo.utils.get_rank import is_global_rank_zero try: from megatron.core import parallel_state @@ -100,3 +104,22 @@ def gather_objects(partial_results_list, main_rank=None): results_list.extend(r) return results_list + + +@contextlib.contextmanager +def temporary_directory(): + """Create a shared temporary directory across ranks in distributed setup. + + This function assumes that the distributed setup has been already + correctly initialized. It is intended to be used only in single-node + setup so that all ranks can access the directory created.""" + + if is_global_rank_zero(): + tmp_dir = [tempfile.TemporaryDirectory()] + else: + tmp_dir = [None] + dist.broadcast_object_list(tmp_dir) + yield tmp_dir[0].name + # We use barrier below to make sure that rank zero won't exit + # and delete tmp_dir while other ranks may still use it + dist.barrier() diff --git a/nemo/utils/model_utils.py b/nemo/utils/model_utils.py index b2a6abbf54aa..c7497511572a 100644 --- a/nemo/utils/model_utils.py +++ b/nemo/utils/model_utils.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import contextlib import copy import importlib import os +import shutil +import tarfile from dataclasses import dataclass, is_dataclass from enum import Enum from functools import lru_cache @@ -636,3 +639,36 @@ def ckpt_to_dir(filepath: Union[str, Path]) -> Path: checkpoint_dir = filepath.with_name(filepath.stem) return checkpoint_dir + + +def save_artifacts(model, output_dir: str, use_abspath: bool = False) -> None: + """Save all model artifacts and tokenizer config to a given output directory.""" + app_state = AppState() + model_file = app_state.model_restore_path + model_cfg = copy.deepcopy(model.cfg) + + # Setup model file handling context: directory or tarball + if os.path.isfile(model_file): + model_file_handler = tarfile.open + kwargs = {"name": model_file, "mode": "r:"} + elif os.path.isdir(model_file): + model_file_handler = contextlib.nullcontext + kwargs = {} + else: + raise FileNotFoundError(model_file) + + # Copy or extract artifacts depending on the context + with model_file_handler(**kwargs) as maybe_tar: + for arti_name, arti_item in model.artifacts.items(): + _, arti_file = arti_item.path.split("nemo:") + arti_path = os.path.join(output_dir, arti_name) + if maybe_tar is not None: + maybe_tar.extract(f"./{arti_file}", path=output_dir) + os.rename(os.path.join(output_dir, arti_file), arti_path) + else: + shutil.copy(os.path.join(model_file, arti_file), arti_path) + # Store artifact path as basename by default. 
Otherwise save absolute path but bear in mind + # that in this case output directory should be permanent for correct artifact recovery later + arti_path = os.path.abspath(arti_path) if use_abspath else os.path.basename(arti_path) + OmegaConf.update(model_cfg, arti_name, arti_path) + OmegaConf.save(model_cfg.tokenizer, os.path.join(output_dir, "tokenizer_config.yaml")) From 0f29ac476e30628607a90be1f88d0e7df8e22c3d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 Feb 2024 12:00:17 +0000 Subject: [PATCH 10/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index d5024495889e..3e6ba169ff3c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -43,7 +43,6 @@ from nemo.collections.nlp.models.language_modeling.megatron.gpt_full_te_layer_autocast_spec import ( get_gpt_full_te_layer_autocast_spec, ) -from nemo.collections.nlp.models.language_modeling.megatron.model_specs import get_gpt_layer_ammo_spec from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel from nemo.collections.nlp.models.language_modeling.megatron.model_specs import get_gpt_layer_ammo_spec from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel From acfb441375d7d2f5b390ab4fc3d74396a36d2c7b Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 4 Mar 2024 09:54:37 +0100 Subject: [PATCH 11/30] Reorganize & reformat Signed-off-by: Jan Lasek --- .../conf/megatron_llama_quantization.yaml | 14 ++-- .../megatron_llama_quantization.py | 15 ++-- nemo/export/quantize/quantizer.py | 79 +++++++++++++------ tests/setup/__main__.py | 3 +- tests/setup/models/create_hf_model.py | 16 +++- 5 files changed, 88 insertions(+), 39 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml index 5603aa9c92ba..b374daa3a638 100644 --- a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml +++ b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml @@ -24,12 +24,14 @@ quantization: quantize_bmm1: false algorithm: fp8 # int8_sq, fp8, int8, int4_awq calib_dataset: cnn_dailymail # pileval, wikitext, cnn_dailymail - num_calib_size: 128 # number of samples used for calibration + num_calib_size: 512 # number of samples used for calibration +export: + decoder_type: llama # gptnext, gpt2, llama + inference_tensor_parallel: 1 # Default using 1 TP for inference + dtype: 16 # Default precision data type + +model_file: llama2-7b-fp16.nemo # Nemo file path +model_save: llama2-7b-fp8.qnemo # Path where the quantized model will be saved tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 -decoder_type: llama # gptnext, llama -model_file: llama2-7b-fp16.nemo # nemo file path -model_save_path: llama2-7b-fp16.qnemo # Path where the quantized model will be saved -inference_tensor_parallel: 1 # Default using 1 TP for inference -dtype: 16 # Default precision data type diff --git a/examples/nlp/language_modeling/megatron_llama_quantization.py 
b/examples/nlp/language_modeling/megatron_llama_quantization.py index 5565900f901b..ebc9897d1fb7 100644 --- a/examples/nlp/language_modeling/megatron_llama_quantization.py +++ b/examples/nlp/language_modeling/megatron_llama_quantization.py @@ -32,9 +32,10 @@ ``` python examples/nlp/language_modeling/megatron_llama_quantization.py \ model_file=llama2-7b-fp16.nemo \ - decoder_type=llama \ - quantization.algorithm=int8_sq \ - model_save_path=llama2-7b-fp16.qnemo + model_save=llama2-7b-fp8.qnemo \ + quantization.algorithm=fp8 \ + export.decoder_type=llama \ + export.inference_tensor_parallel=1 ``` """ @@ -66,7 +67,7 @@ def main(cfg) -> None: if not torch.cuda.is_available(): raise EnvironmentError("GPU is required for the inference.") - quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.trainer) + quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.export, cfg.trainer) dataloader = get_calib_dataloader( cfg.quantization.calib_dataset, @@ -76,9 +77,11 @@ def main(cfg) -> None: ) dataloader = [data for data in dataloader] - model = quantizer.quantize(cfg.model_file, dataloader, cfg.tensor_model_parallel_size) + model = quantizer.quantize( + cfg.model_file, dataloader, cfg.tensor_model_parallel_size, cfg.pipeline_model_parallel_size + ) - quantizer.export(model, cfg.model_save_path, cfg.decoder_type, cfg.dtype, cfg.inference_tensor_parallel) + quantizer.export(model, cfg.model_save) if __name__ == '__main__': diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 4f3ea26513cf..f17978c3e87e 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -15,12 +15,11 @@ import copy import os import tarfile -from typing import Optional +from typing import List, Optional import ammo.torch.quantization as atq import torch.distributed as dist from ammo.torch.export import export_model_config -from ammo.torch.utils import print_rank_0 from megatron.core import parallel_state from omegaconf import OmegaConf from omegaconf.omegaconf import DictConfig, open_dict @@ -42,6 +41,8 @@ "w4a8_awq": atq.W4A8_AWQ_BETA_CFG, } +SUPPORTED_DTYPE = [16, "16", "bf16"] # Default precision for non-quantized layers + class Quantizer: @@ -64,16 +65,24 @@ class Quantizer: model families is experimental and might not be fully supported. Available quantization methods are listed in QUANT_CFG_CHOICES dictionary on top of this file. - Please consult AMMO docummentation for details. You can also ispect different choices in + Please consult AMMO documentation for details. You can also inspect different choices in examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml for quantization algorithms and calibration data as well as recommended settings. 
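A rough end-to-end usage sketch mirroring how the example script wires the class up in this patch; the config path and the inline calibration text are placeholders, not recommendations:

    from omegaconf import OmegaConf
    from nemo.export.quantize import Quantizer

    cfg = OmegaConf.load("examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml")
    quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.export, cfg.trainer)

    # Calibration data is a list of batches, each batch a list of raw text strings.
    dataloader = [["Some calibration text."] * cfg.inference.batch_size]

    model = quantizer.quantize(
        cfg.model_file, dataloader, cfg.tensor_model_parallel_size, cfg.pipeline_model_parallel_size
    )
    quantizer.export(model, cfg.model_save)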
""" - def __init__(self, quantization_config: DictConfig, inference_config: DictConfig, trainer_config: DictConfig): + def __init__( + self, + quantization_config: DictConfig, + inference_config: DictConfig, + export_config: DictConfig, + trainer_config: DictConfig, + ): + assert export_config.dtype in SUPPORTED_DTYPE + assert quantization_config.algorithm in QUANT_CFG_CHOICES self.quantization_config = quantization_config self.inference_config = inference_config + self.export_config = export_config self.trainer_config = trainer_config - assert self.quantization_config.algorithm in QUANT_CFG_CHOICES atq_config = QUANT_CFG_CHOICES[quantization_config.algorithm] if quantization_config.algorithm != "fp8": # disable quantization for the last output layer @@ -81,17 +90,27 @@ def __init__(self, quantization_config: DictConfig, inference_config: DictConfig atq_config["quant_cfg"]["*.output_layer.*"] = {"enable": False} self.atq_config = atq_config - def _load_model(self, model_file, tensor_model_parallel_size: Optional[int] = None): + def _load_model( + self, + model_file: str, + tensor_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_size: Optional[int] = None, + ): trainer = Trainer(strategy=NLPDDPStrategy(), **self.trainer_config) connector = NLPSaveRestoreConnector() if os.path.isdir(model_file): connector.model_extracted_dir = model_file - model_cfg = self._restore_and_modify_config(model_file, trainer, connector, tensor_model_parallel_size) + model_cfg = self._restore_and_modify_config( + model_file, trainer, connector, tensor_model_parallel_size, pipeline_model_parallel_size + ) model = MegatronGPTModel.restore_from( - restore_path=model_file, trainer=trainer, override_config_path=model_cfg, save_restore_connector=connector, + restore_path=model_file, + trainer=trainer, + override_config_path=model_cfg, + save_restore_connector=connector, ) model.freeze() @@ -99,7 +118,10 @@ def _load_model(self, model_file, tensor_model_parallel_size: Optional[int] = No model.model.module.language_model.encoder.activations_checkpoint_method = None except AttributeError: pass - print_rank_0(model) + + if is_global_rank_zero(): + print(model) + self._check_ddp_initialized(model) return model @@ -119,47 +141,60 @@ def _restore_and_modify_config( trainer: Trainer, connector: NLPSaveRestoreConnector, tensor_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_size: Optional[int] = None, ): model_cfg = MegatronGPTModel.restore_from( - restore_path=model_file, trainer=trainer, save_restore_connector=connector, return_config=True, + restore_path=model_file, + trainer=trainer, + save_restore_connector=connector, + return_config=True, ) with open_dict(model_cfg): model_cfg.activations_checkpoint_method = None model_cfg.activations_checkpoint_granularity = None if tensor_model_parallel_size is not None: model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.name = "ammo" # Model needs to be loaded in "ammo" layer spec + if pipeline_model_parallel_size is not None: + model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size + # Only custom AMMO spec is supported for PTQ: this custom spec is largely based on local Megatron-LM + # layer definitions to avoid Transformer Engine implementations that are currently not supported. 
+ model_cfg.name = "ammo" return model_cfg - def quantize(self, model_file: str, dataloader, tensor_model_parallel_size: Optional[int] = None): - model = self._load_model(model_file, tensor_model_parallel_size) + def quantize( + self, + model_file: str, + dataloader: List[List[str]], + tensor_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_size: Optional[int] = None, + ): + model = self._load_model(model_file, tensor_model_parallel_size, pipeline_model_parallel_size) model.set_inference_config(OmegaConf.to_container(self.inference_config)) def forward_loop(): for i, batch in enumerate(dataloader): - print_rank_0(f"Calibrating batch {i}") + if is_global_rank_zero(): + print(f"Calibrating batch {i}") model.predict_step(batch, i) atq.quantize(model, self.atq_config, forward_loop) return model - def export(self, model, output_file: str, decoder_type: str, dtype: str, inference_tensor_parallel: int): - supported_dtype = [16, "16", "bf16"] # FIXME: Move that to top - assert dtype in supported_dtype, f"{dtype} not supported. Supported dtypes are {supported_dtype}" - torch_dtype = torch_dtype_from_precision(dtype) + def export(self, model, model_save: str): + torch_dtype = torch_dtype_from_precision(self.export_config.dtype) with temporary_directory() as tmp_dir: export_model_config( model=model, - decoder_type=decoder_type, + decoder_type=self.export_config.decoder_type, dtype=torch_dtype, export_dir=tmp_dir, - inference_tensor_parallel=inference_tensor_parallel, + inference_tensor_parallel=self.export_config.inference_tensor_parallel, ) dist.barrier() # Wait until all ranks complete export_model_config step if is_global_rank_zero(): - logging.info(f"Exporting quantized weights, tokenizer config, and model artifacts to {output_file}...") - with tarfile.open(output_file, "w:gz") as tar: + logging.info(f"Exporting quantized weights, model artifacts, and tokenizer config to {model_save}...") + with tarfile.open(model_save, "w:gz") as tar: save_artifacts(model, tmp_dir) tar.add(tmp_dir, arcname="./") diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py index 51cdab795a99..c9f33cf73996 100644 --- a/tests/setup/__main__.py +++ b/tests/setup/__main__.py @@ -30,7 +30,8 @@ os.makedirs(args.save_dir, exist_ok=True) create_sample_jsonl( - output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), overwrite=args.overwrite, + output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), + overwrite=args.overwrite, ) create_hf_model( diff --git a/tests/setup/models/create_hf_model.py b/tests/setup/models/create_hf_model.py index 9f57d5996dfc..fd6c1bdb0277 100644 --- a/tests/setup/models/create_hf_model.py +++ b/tests/setup/models/create_hf_model.py @@ -79,16 +79,24 @@ def create_hf_model( if __name__ == "__main__": parser = argparse.ArgumentParser("Create a HuggingFace model (random initialization) for testing purposes.") parser.add_argument( - "--model_name_or_path", required=True, help="Model name or local path with model config and tokenizer", + "--model_name_or_path", + required=True, + help="Model name or local path with model config and tokenizer", ) parser.add_argument( - "--output_dir", required=True, help="Output directory", + "--output_dir", + required=True, + help="Output directory", ) parser.add_argument( - "--config_updates", type=json.loads, help="Parameter updates in JSON format to overwrite for model config", + "--config_updates", + type=json.loads, + help="Parameter updates in JSON format to overwrite for model config", ) 
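For illustration, the create_hf_model test helper touched in this hunk can also be driven directly from Python; the model name, output directory and layer sizes below are placeholders taken from the CI setup, not requirements:

    from tests.setup.models.create_hf_model import create_hf_model

    # Produce a small, randomly initialized Llama-style HF checkpoint for tests.
    create_hf_model(
        model_name_or_path="meta-llama/Llama-2-7b-hf",
        output_dir="/tmp/tiny_llama2_hf",
        config_updates={"hidden_size": 128, "num_attention_heads": 4,
                        "num_hidden_layers": 2, "num_key_value_heads": 4},
    )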
parser.add_argument( - "--overwrite", action="store_true", help="Overwrite file if it exists", + "--overwrite", + action="store_true", + help="Overwrite file if it exists", ) args = parser.parse_args() create_hf_model(args.model_name_or_path, args.output_dir, args.config_updates) From e03cb87088c60bbb05cc7e53598a2fd748a9249c Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 4 Mar 2024 10:50:25 +0100 Subject: [PATCH 12/30] Tests for FP8 and INT4 AWQ Signed-off-by: Jan Lasek --- Jenkinsfile | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8902151c603f..5af6fc147dee 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -453,15 +453,39 @@ pipeline { } failFast true parallel { - stage('Llama') { + stage('Llama2 - INT8 SQ') { steps { - sh 'CUDA_VISIBLE_DEVICES=0 python examples/nlp/language_modeling/megatron_llama_quantization.py \ + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ quantization.algorithm=int8_sq \ quantization.num_calib_size=8 \ inference.batch_size=2 \ - model_save_path=/home/TestData/nlp/megatron_llama/ci.qnemo' + model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' + } + } + stage('Llama2 - FP8') { + steps { + sh 'mpirun -n 2 --allow-run-as-root python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=fp8 \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' + } + } + stage('Llama2 - INT4 AWQ') { + steps { + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=int4_awq \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' } } From 4b49ec6b2bd73fe7daba3d8bccf08023b0c1a536 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Mar 2024 10:07:27 +0000 Subject: [PATCH 13/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/export/quantize/quantizer.py | 10 ++-------- tests/setup/__main__.py | 3 +-- tests/setup/models/create_hf_model.py | 16 ++++------------ 3 files changed, 7 insertions(+), 22 deletions(-) diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index f17978c3e87e..54f0657a44b7 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -107,10 +107,7 @@ def _load_model( ) model = MegatronGPTModel.restore_from( - restore_path=model_file, - trainer=trainer, - override_config_path=model_cfg, - save_restore_connector=connector, + restore_path=model_file, trainer=trainer, override_config_path=model_cfg, save_restore_connector=connector, ) model.freeze() @@ -144,10 +141,7 @@ def _restore_and_modify_config( pipeline_model_parallel_size: Optional[int] = None, ): model_cfg = MegatronGPTModel.restore_from( - 
restore_path=model_file, - trainer=trainer, - save_restore_connector=connector, - return_config=True, + restore_path=model_file, trainer=trainer, save_restore_connector=connector, return_config=True, ) with open_dict(model_cfg): model_cfg.activations_checkpoint_method = None diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py index c9f33cf73996..51cdab795a99 100644 --- a/tests/setup/__main__.py +++ b/tests/setup/__main__.py @@ -30,8 +30,7 @@ os.makedirs(args.save_dir, exist_ok=True) create_sample_jsonl( - output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), - overwrite=args.overwrite, + output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), overwrite=args.overwrite, ) create_hf_model( diff --git a/tests/setup/models/create_hf_model.py b/tests/setup/models/create_hf_model.py index fd6c1bdb0277..9f57d5996dfc 100644 --- a/tests/setup/models/create_hf_model.py +++ b/tests/setup/models/create_hf_model.py @@ -79,24 +79,16 @@ def create_hf_model( if __name__ == "__main__": parser = argparse.ArgumentParser("Create a HuggingFace model (random initialization) for testing purposes.") parser.add_argument( - "--model_name_or_path", - required=True, - help="Model name or local path with model config and tokenizer", + "--model_name_or_path", required=True, help="Model name or local path with model config and tokenizer", ) parser.add_argument( - "--output_dir", - required=True, - help="Output directory", + "--output_dir", required=True, help="Output directory", ) parser.add_argument( - "--config_updates", - type=json.loads, - help="Parameter updates in JSON format to overwrite for model config", + "--config_updates", type=json.loads, help="Parameter updates in JSON format to overwrite for model config", ) parser.add_argument( - "--overwrite", - action="store_true", - help="Overwrite file if it exists", + "--overwrite", action="store_true", help="Overwrite file if it exists", ) args = parser.parse_args() create_hf_model(args.model_name_or_path, args.output_dir, args.config_updates) From 90988b141a75b06ddb25aeca5bfaa3563d25ee1d Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 4 Mar 2024 11:19:04 +0100 Subject: [PATCH 14/30] Add load_config helper function Signed-off-by: Jan Lasek --- nemo/export/quantize/quantizer.py | 25 +++++++++++-------------- nemo/utils/model_utils.py | 13 +++++++++++++ 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 54f0657a44b7..571d398850bc 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -31,7 +31,7 @@ from nemo.utils import logging from nemo.utils.distributed import temporary_directory from nemo.utils.get_rank import is_global_rank_zero -from nemo.utils.model_utils import save_artifacts +from nemo.utils.model_utils import load_config, save_artifacts QUANT_CFG_CHOICES = { "int8": atq.INT8_DEFAULT_CFG, @@ -96,16 +96,14 @@ def _load_model( tensor_model_parallel_size: Optional[int] = None, pipeline_model_parallel_size: Optional[int] = None, ): + """Load model using AMMO layer spec for quantization.""" + model_cfg = self._load_and_modify_config( + model_file, tensor_model_parallel_size, pipeline_model_parallel_size + ) + trainer = Trainer(strategy=NLPDDPStrategy(), **self.trainer_config) connector = NLPSaveRestoreConnector() - if os.path.isdir(model_file): - connector.model_extracted_dir = model_file - - model_cfg = self._restore_and_modify_config( - model_file, trainer, connector, 
tensor_model_parallel_size, pipeline_model_parallel_size - ) - model = MegatronGPTModel.restore_from( restore_path=model_file, trainer=trainer, override_config_path=model_cfg, save_restore_connector=connector, ) @@ -132,17 +130,14 @@ def dummy(): model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) model.trainer.strategy.setup_environment() - def _restore_and_modify_config( + def _load_and_modify_config( self, model_file: str, - trainer: Trainer, - connector: NLPSaveRestoreConnector, tensor_model_parallel_size: Optional[int] = None, pipeline_model_parallel_size: Optional[int] = None, ): - model_cfg = MegatronGPTModel.restore_from( - restore_path=model_file, trainer=trainer, save_restore_connector=connector, return_config=True, - ) + model_cfg = load_config(model_file) + with open_dict(model_cfg): model_cfg.activations_checkpoint_method = None model_cfg.activations_checkpoint_granularity = None @@ -163,6 +158,7 @@ def quantize( tensor_model_parallel_size: Optional[int] = None, pipeline_model_parallel_size: Optional[int] = None, ): + """Quantize model checkpoint using given dataloader and optional custom parallelism settings.""" model = self._load_model(model_file, tensor_model_parallel_size, pipeline_model_parallel_size) model.set_inference_config(OmegaConf.to_container(self.inference_config)) @@ -176,6 +172,7 @@ def forward_loop(): return model def export(self, model, model_save: str): + """Export model to '.qnemo' format for TensorRT-LLM engine build.""" torch_dtype = torch_dtype_from_precision(self.export_config.dtype) with temporary_directory() as tmp_dir: diff --git a/nemo/utils/model_utils.py b/nemo/utils/model_utils.py index c7497511572a..8889f13d5b98 100644 --- a/nemo/utils/model_utils.py +++ b/nemo/utils/model_utils.py @@ -18,6 +18,7 @@ import os import shutil import tarfile +import tempfile from dataclasses import dataclass, is_dataclass from enum import Enum from functools import lru_cache @@ -64,6 +65,18 @@ class ArtifactItem: hashed_path: Optional[str] = None +def load_config(model_file: str) -> DictConfig: + """Load model config from extracted directory or '.nemo' tarball.""" + if os.path.isfile(model_file): + with tempfile.TemporaryDirectory() as tmp, tarfile.open(model_file, "r:") as tar: + tar.extract("./model_config.yaml", path=tmp) + model_config = OmegaConf.load(os.path.join(tmp, "model_config.yaml")) + else: + model_config = OmegaConf.load(os.path.join(model_file, "model_config.yaml")) + + return model_config + + def resolve_dataset_name_from_cfg(cfg: 'DictConfig') -> Optional[str]: """ Parses items of the provided sub-config to find the first potential key that From d115befde42778972612facd83ff43cddf410340 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Mar 2024 10:20:30 +0000 Subject: [PATCH 15/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/export/quantize/quantizer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 571d398850bc..998935cac214 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -97,9 +97,7 @@ def _load_model( pipeline_model_parallel_size: Optional[int] = None, ): """Load model using AMMO layer spec for quantization.""" - model_cfg = self._load_and_modify_config( - model_file, tensor_model_parallel_size, pipeline_model_parallel_size - ) + model_cfg = 
self._load_and_modify_config(model_file, tensor_model_parallel_size, pipeline_model_parallel_size) trainer = Trainer(strategy=NLPDDPStrategy(), **self.trainer_config) connector = NLPSaveRestoreConnector() From 6fcbcd06f48441d1a7fc7f2a05ec80dea729171c Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 4 Mar 2024 11:23:31 +0100 Subject: [PATCH 16/30] Unused import removal Signed-off-by: Jan Lasek --- nemo/export/quantize/quantizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 998935cac214..dc044e52870a 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -13,7 +13,6 @@ # limitations under the License. import copy -import os import tarfile from typing import List, Optional From a5e818f08e4395e6df23795651a211efefdc6331 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Mon, 4 Mar 2024 12:39:13 +0100 Subject: [PATCH 17/30] Fix FP8 Jenkins test Signed-off-by: Jan Lasek --- Jenkinsfile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 5af6fc147dee..a560f284d8a8 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -462,19 +462,21 @@ pipeline { quantization.num_calib_size=8 \ inference.batch_size=2 \ model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci.qnemo' } } stage('Llama2 - FP8') { steps { sh 'mpirun -n 2 --allow-run-as-root python examples/nlp/language_modeling/megatron_llama_quantization.py \ model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ + tensor_model_parallel_size=2 \ + trainer.devices=2 \ quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ quantization.algorithm=fp8 \ quantization.num_calib_size=8 \ inference.batch_size=2 \ model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci.qnemo' } } stage('Llama2 - INT4 AWQ') { @@ -486,11 +488,12 @@ pipeline { quantization.num_calib_size=8 \ inference.batch_size=2 \ model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci.qnemo' } } } } + stage('L2: ASR dev run') { when { anyOf { From 12f3717861f64791e2a924e9af9e57108ac1d764 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 5 Mar 2024 12:53:13 +0100 Subject: [PATCH 18/30] Fix TP=2 test cont'd: no need to use mpirun Signed-off-by: Jan Lasek --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index a560f284d8a8..ab4105fa219f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -467,7 +467,7 @@ pipeline { } stage('Llama2 - FP8') { steps { - sh 'mpirun -n 2 --allow-run-as-root python examples/nlp/language_modeling/megatron_llama_quantization.py \ + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ tensor_model_parallel_size=2 \ trainer.devices=2 \ From a96be0f4d2347850dc222cbe0c8312d805feb003 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 5 Mar 2024 14:40:12 +0100 Subject: [PATCH 19/30] Allow for patches in AMMO versioning Signed-off-by: Jan Lasek --- requirements/requirements_nlp.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 4bffca663d71..e613bf649692 100644 --- 
a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -12,7 +12,7 @@ markdown2 matplotlib>=3.3.2 megatron_core==0.5.0 nltk>=3.6.5 -nvidia-ammo==0.7.3 +nvidia-ammo~=0.7.3 opencc<1.1.7 pangu rapidfuzz From c99b99231e92539eec165596d2c22d1ebb2e7de7 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 5 Mar 2024 14:41:48 +0100 Subject: [PATCH 20/30] Drop AWQ test for now (need to debug) Signed-off-by: Jan Lasek --- Jenkinsfile | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index ab4105fa219f..6e550ef68673 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -479,18 +479,6 @@ pipeline { sh 'rm -f /home/TestData/nlp/megatron_llama/ci.qnemo' } } - stage('Llama2 - INT4 AWQ') { - steps { - sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ - model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ - quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - quantization.algorithm=int4_awq \ - quantization.num_calib_size=8 \ - inference.batch_size=2 \ - model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.qnemo' - } - } } } From 2eff82fe66b545c37f69bf9ab00fff1c63366c35 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 5 Mar 2024 16:35:19 +0100 Subject: [PATCH 21/30] Allow for patches in AMMO versioning cont'd Signed-off-by: Jan Lasek --- requirements/requirements_nlp.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index e613bf649692..984a3aa45478 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -12,7 +12,7 @@ markdown2 matplotlib>=3.3.2 megatron_core==0.5.0 nltk>=3.6.5 -nvidia-ammo~=0.7.3 +nvidia-ammo~=0.7.0 opencc<1.1.7 pangu rapidfuzz From 739fe3032767128c85ba7d38c6a4ce7ce5d2925b Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Wed, 6 Mar 2024 10:20:11 +0100 Subject: [PATCH 22/30] Use AMMO spec from MCore as it has been published Signed-off-by: Jan Lasek --- Dockerfile | 2 +- Jenkinsfile | 2 +- .../conf/megatron_llama_quantization.yaml | 2 +- .../megatron_llama_quantization.py | 2 +- .../language_modeling/megatron/model_specs.py | 60 ------------------- .../language_modeling/megatron_gpt_model.py | 2 +- nemo/export/quantize/quantizer.py | 3 +- 7 files changed, 7 insertions(+), 66 deletions(-) delete mode 100644 nemo/collections/nlp/models/language_modeling/megatron/model_specs.py diff --git a/Dockerfile b/Dockerfile index de85b35bf253..81e8f7c73b7b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -66,7 +66,7 @@ WORKDIR /workspace/ # We leave it here in case we need to work off of a specific commit in main RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout ad53b1e38689a0ceed75ade7821f4e6c7554abb4 && \ + git checkout 36e9b6bf3d8034b10c9bbd9fc357c2df2bd1515c && \ pip install . # Performance optimizations for distributed optimizer: https://github.com/NVIDIA/apex/pull/1771 diff --git a/Jenkinsfile b/Jenkinsfile index 5a1aabf5559c..5a60530aa47e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -91,7 +91,7 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout 5f9c870f9f24b482509699d206a9dbb00958f6fc && \ + git checkout 36e9b6bf3d8034b10c9bbd9fc357c2df2bd1515c && \ pip install .' 
} } diff --git a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml index b374daa3a638..5a5e87eba7c4 100644 --- a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml +++ b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml @@ -9,7 +9,7 @@ inference: repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. compute_logprob: false # a flag used to compute logprob of all the input text, a very special case of running inference, default False - batch_size: 4 # batch size for inference + batch_size: 64 # batch size for inference max_context_length: 512 # max length of the context, input sequence will be truncated if it is longer than this trainer: diff --git a/examples/nlp/language_modeling/megatron_llama_quantization.py b/examples/nlp/language_modeling/megatron_llama_quantization.py index ebc9897d1fb7..7b6fe88ba4f6 100644 --- a/examples/nlp/language_modeling/megatron_llama_quantization.py +++ b/examples/nlp/language_modeling/megatron_llama_quantization.py @@ -40,7 +40,7 @@ """ -def get_calib_dataloader(data="cnn_dailymail", batch_size=4, calib_size=512, max_sequence_length=512): +def get_calib_dataloader(data="cnn_dailymail", batch_size=64, calib_size=512, max_sequence_length=512): if data == "pileval": dataset = load_dataset("json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train") text_column = "text" diff --git a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py b/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py deleted file mode 100644 index 006f5d730045..000000000000 --- a/nemo/collections/nlp/models/language_modeling/megatron/model_specs.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# TODO: This spec will be defined in MCore>=0.6.0 and is temporary -try: - - from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add - from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear - from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules - from megatron.core.transformer.custom_layers.transformer_engine import TENorm - from megatron.core.transformer.dot_product_attention import DotProductAttention - from megatron.core.transformer.enums import AttnMaskType - from megatron.core.transformer.mlp import MLP, MLPSubmodules - from megatron.core.transformer.spec_utils import ModuleSpec - from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - ModuleSpec = None - - -def get_gpt_layer_ammo_spec() -> ModuleSpec: - assert HAVE_MEGATRON_CORE - return ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - input_layernorm=TENorm, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=ColumnParallelLinear, core_attention=DotProductAttention, linear_proj=RowParallelLinear, - ), - ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=TENorm, - mlp=ModuleSpec( - module=MLP, submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,), - ), - mlp_bda=get_bias_dropout_add, - sharded_state_dict_keys_map={ - 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', - 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', - }, - ), - ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index ae3def28df3d..2afbed322abc 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -44,7 +44,6 @@ get_gpt_full_te_layer_autocast_spec, ) from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel -from nemo.collections.nlp.models.language_modeling.megatron.model_specs import get_gpt_layer_ammo_spec from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel from nemo.collections.nlp.modules.common.megatron.build_model import build_model from nemo.collections.nlp.modules.common.megatron.module import Float16Module @@ -91,6 +90,7 @@ from megatron.core import InferenceParams, parallel_state, tensor_parallel from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset + from megatron.core.deploy.gpt.model_specs import get_gpt_layer_ammo_spec from megatron.core.models.gpt import GPTModel as MCoreGPTModel from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.pipeline_parallel.schedules import get_forward_backward_func diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index dc044e52870a..8e4c2c4a1386 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -111,10 +111,11 @@ def _load_model( except AttributeError: pass + self._check_ddp_initialized(model) + if is_global_rank_zero(): print(model) - 
self._check_ddp_initialized(model) return model def _check_ddp_initialized(self, model): From ae0498dbccf89db58e66a20aee71dcb0e0d6abd4 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Fri, 8 Mar 2024 14:48:50 +0100 Subject: [PATCH 23/30] Make AMMO optional dependency and properly import guard it Signed-off-by: Jan Lasek --- Dockerfile | 4 +++- Jenkinsfile | 6 ++++++ nemo/export/quantize/quantizer.py | 29 ++++++++++++++++++----------- reinstall.sh | 2 +- requirements/requirements_nlp.txt | 1 - 5 files changed, 28 insertions(+), 14 deletions(-) diff --git a/Dockerfile b/Dockerfile index 81e8f7c73b7b..970c34a690d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -126,12 +126,14 @@ RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_k2.sh); INSTALL WORKDIR /tmp/nemo ENV LHOTSE_REQUIRE_TORCHAUDIO=0 COPY requirements . -RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --extra-index-url https://pypi.nvidia.com --no-cache-dir -r $f; done +RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done # install flash attention RUN pip install flash-attn # install numba for latest containers RUN pip install numba>=0.57.1 +# install ammo +RUN pip install nvidia-ammo~=0.7.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir # copy nemo source into a scratch image FROM scratch as nemo-src diff --git a/Jenkinsfile b/Jenkinsfile index 5a60530aa47e..b67fb4ac6f74 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -96,6 +96,12 @@ pipeline { } } + stage('AMMO installation') { + steps { + sh 'pip install nvidia-ammo~=0.7.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir' + } + } + stage('PyTorch Lightning version') { steps { sh 'python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"' diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 8e4c2c4a1386..962529661d4e 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -16,9 +16,7 @@ import tarfile from typing import List, Optional -import ammo.torch.quantization as atq import torch.distributed as dist -from ammo.torch.export import export_model_config from megatron.core import parallel_state from omegaconf import OmegaConf from omegaconf.omegaconf import DictConfig, open_dict @@ -32,15 +30,14 @@ from nemo.utils.get_rank import is_global_rank_zero from nemo.utils.model_utils import load_config, save_artifacts -QUANT_CFG_CHOICES = { - "int8": atq.INT8_DEFAULT_CFG, - "int8_sq": atq.INT8_SMOOTHQUANT_CFG, - "fp8": atq.FP8_DEFAULT_CFG, - "int4_awq": atq.INT4_AWQ_CFG, - "w4a8_awq": atq.W4A8_AWQ_BETA_CFG, -} +try: + import ammo.torch.quantization as atq + from ammo.torch.export import export_model_config + HAVE_AMMO = True -SUPPORTED_DTYPE = [16, "16", "bf16"] # Default precision for non-quantized layers +except (ImportError, ModuleNotFoundError) as e: + HAVE_AMMO = False + HAVE_AMMO_ERROR = e class Quantizer: @@ -63,7 +60,7 @@ class Quantizer: the quantization command with decoder_type parameter on exporting (see below). Quantizing other model families is experimental and might not be fully supported. - Available quantization methods are listed in QUANT_CFG_CHOICES dictionary on top of this file. + Available quantization methods are listed in QUANT_CFG_CHOICES dictionary below. Please consult AMMO documentation for details. 
You can also inspect different choices in examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml for quantization algorithms and calibration data as well as recommended settings. @@ -76,6 +73,16 @@ def __init__( export_config: DictConfig, trainer_config: DictConfig, ): + if not HAVE_AMMO: + raise RuntimeError("nvidia-ammo>=0.7 is needed to use Quantizer") from HAVE_AMMO_ERROR + QUANT_CFG_CHOICES = { + "int8": atq.INT8_DEFAULT_CFG, + "int8_sq": atq.INT8_SMOOTHQUANT_CFG, + "fp8": atq.FP8_DEFAULT_CFG, + "int4_awq": atq.INT4_AWQ_CFG, + "w4a8_awq": atq.W4A8_AWQ_BETA_CFG, + } + SUPPORTED_DTYPE = [16, "16", "bf16"] # Default precision for non-quantized layers assert export_config.dtype in SUPPORTED_DTYPE assert quantization_config.algorithm in QUANT_CFG_CHOICES self.quantization_config = quantization_config diff --git a/reinstall.sh b/reinstall.sh index a5004590c7c1..d64b56103dd3 100755 --- a/reinstall.sh +++ b/reinstall.sh @@ -34,7 +34,7 @@ else ${PIP} install build pytest-runner python -m build --no-isolation --wheel DIST_FILE=$(find ./dist -name "*.whl" | head -n 1) - ${PIP} install --extra-index-url https://pypi.nvidia.com "${DIST_FILE}[all]" + ${PIP} install "${DIST_FILE}[all]" fi echo 'All done!' diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 984a3aa45478..2484328293e1 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -12,7 +12,6 @@ markdown2 matplotlib>=3.3.2 megatron_core==0.5.0 nltk>=3.6.5 -nvidia-ammo~=0.7.0 opencc<1.1.7 pangu rapidfuzz From b56ff60381b80d0add4456297dab0fb52b30cf1e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Mar 2024 14:31:47 +0000 Subject: [PATCH 24/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/export/quantize/quantizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 962529661d4e..3114416be18b 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -33,6 +33,7 @@ try: import ammo.torch.quantization as atq from ammo.torch.export import export_model_config + HAVE_AMMO = True except (ImportError, ModuleNotFoundError) as e: From 01f215d6b6c10a2d0f7267e89816ba148f12bb88 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 12 Mar 2024 09:49:07 +0100 Subject: [PATCH 25/30] Add Llama2 AWQ test and update some paths Signed-off-by: Jan Lasek --- Jenkinsfile | 28 +++++++++++++++++++++------- tests/setup/__main__.py | 6 +++--- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index b67fb4ac6f74..67b70516ed60 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -417,7 +417,7 @@ pipeline { steps { sh 'CUDA_VISIBLE_DEVICES=0 python scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py \ --in-file=/home/TestData/nlp/megatron_llama/llama-ci-hf \ - --out-file=/home/TestData/nlp/megatron_llama/ci.nemo \ + --out-file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ --precision=16' } } @@ -462,27 +462,41 @@ pipeline { stage('Llama2 - INT8 SQ') { steps { sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ - model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ + model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ quantization.algorithm=int8_sq \ quantization.num_calib_size=8 
\ inference.batch_size=2 \ - model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.qnemo' + model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo' } } stage('Llama2 - FP8') { steps { sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ - model_file=/home/TestData/nlp/megatron_llama/ci.nemo \ + model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ tensor_model_parallel_size=2 \ trainer.devices=2 \ quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ quantization.algorithm=fp8 \ quantization.num_calib_size=8 \ inference.batch_size=2 \ - model_save=/home/TestData/nlp/megatron_llama/ci.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.qnemo' + model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci_fp8.qnemo' + } + } + stage('Llama2 - AWQ') { + steps { + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ + tensor_model_parallel_size=2 \ + trainer.devices=2 \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=int4_awq \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo' } } } diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py index 51cdab795a99..289a2537e2f2 100644 --- a/tests/setup/__main__.py +++ b/tests/setup/__main__.py @@ -34,9 +34,9 @@ ) create_hf_model( - model_name_or_path="/home/TestData/nlp/megatron_llama/llama-ci-hf", # FIXME: change to "meta-llama/Llama-2-7b-hf" - output_dir=os.path.join(args.save_dir, "tiny_llama2_hf"), - config_updates={"hidden_size": 128, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}, + model_name_or_path="/home/TestData/nlp/meta-llama/Llama-2-7b-hf", + output_dir=os.path.join(args.save_dir, "megatron_llama/llama-ci-hf"), + config_updates={"hidden_size": 256, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}, overwrite=args.overwrite, ) print("Setup done.") From fe1eeba84af7c4ac6f2be9131c91401c907a1216 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 12 Mar 2024 13:08:12 +0100 Subject: [PATCH 26/30] Enable specifying quantization.algorithm=null for baseline accuracy checks Signed-off-by: Jan Lasek --- Jenkinsfile | 11 +++++++- .../conf/megatron_llama_quantization.yaml | 2 +- .../megatron_llama_quantization.py | 19 ++++++++----- nemo/export/quantize/quantizer.py | 28 +++++++++++++------ 4 files changed, 42 insertions(+), 18 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 67b70516ed60..15b04a34b12c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -459,6 +459,15 @@ pipeline { } failFast true parallel { + stage('Llama2 - Export Only') { + steps { + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ + quantization.algorithm=null \ + model_save=/home/TestData/nlp/megatron_llama/ci_baseline.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci_baseline.qnemo' + } + } stage('Llama2 - INT8 SQ') { steps { sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ @@ -485,7 +494,7 @@ pipeline { sh 'rm -f /home/TestData/nlp/megatron_llama/ci_fp8.qnemo' } } - 
stage('Llama2 - AWQ') { + stage('Llama2 - INT4 AWQ') { steps { sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ diff --git a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml index 5a5e87eba7c4..322ab946febe 100644 --- a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml +++ b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml @@ -22,7 +22,7 @@ trainer: quantization: quantize_bmm1: false - algorithm: fp8 # int8_sq, fp8, int8, int4_awq + algorithm: fp8 # int8_sq, fp8, int8, int4_awq, null calib_dataset: cnn_dailymail # pileval, wikitext, cnn_dailymail num_calib_size: 512 # number of samples used for calibration diff --git a/examples/nlp/language_modeling/megatron_llama_quantization.py b/examples/nlp/language_modeling/megatron_llama_quantization.py index 7b6fe88ba4f6..16fb5ae9c13b 100644 --- a/examples/nlp/language_modeling/megatron_llama_quantization.py +++ b/examples/nlp/language_modeling/megatron_llama_quantization.py @@ -69,13 +69,18 @@ def main(cfg) -> None: quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.export, cfg.trainer) - dataloader = get_calib_dataloader( - cfg.quantization.calib_dataset, - cfg.inference.batch_size, - cfg.quantization.num_calib_size, - cfg.inference.max_context_length, - ) - dataloader = [data for data in dataloader] + # Quantization algorithm can be set to None. This is useful for baseline precision + # accuracy validation. In this case only weights export step will be performed: + if cfg.quantization.algorithm is not None: + dataloader = get_calib_dataloader( + cfg.quantization.calib_dataset, + cfg.inference.batch_size, + cfg.quantization.num_calib_size, + cfg.inference.max_context_length, + ) + dataloader = [data for data in dataloader] + else: + dataloader = None model = quantizer.quantize( cfg.model_file, dataloader, cfg.tensor_model_parallel_size, cfg.pipeline_model_parallel_size diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 3114416be18b..591848fd0adf 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -65,6 +65,9 @@ class Quantizer: Please consult AMMO documentation for details. You can also inspect different choices in examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml for quantization algorithms and calibration data as well as recommended settings. + + Quantization algorithm can also be conveniently set to 'null' to perform only weights export step + for TensorRT-LLM deployment. This is useful to getting baseline results for a full-precision model. 
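Concretely, the baseline path described above reduces to the following sketch (a hedged illustration using the config names from this series; only the algorithm override differs from the quantized case):

    # Baseline, non-quantized export: with algorithm set to None, quantize()
    # only loads the model, and export() still writes the weights, model
    # artifacts and tokenizer config for a TensorRT-LLM engine build.
    cfg.quantization.algorithm = None
    quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.export, cfg.trainer)
    model = quantizer.quantize(cfg.model_file, None, cfg.tensor_model_parallel_size,
                               cfg.pipeline_model_parallel_size)
    quantizer.export(model, cfg.model_save)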
""" def __init__( @@ -85,17 +88,20 @@ def __init__( } SUPPORTED_DTYPE = [16, "16", "bf16"] # Default precision for non-quantized layers assert export_config.dtype in SUPPORTED_DTYPE - assert quantization_config.algorithm in QUANT_CFG_CHOICES + assert quantization_config.algorithm is None or quantization_config.algorithm in QUANT_CFG_CHOICES self.quantization_config = quantization_config self.inference_config = inference_config self.export_config = export_config self.trainer_config = trainer_config - atq_config = QUANT_CFG_CHOICES[quantization_config.algorithm] - if quantization_config.algorithm != "fp8": - # disable quantization for the last output layer - atq_config = copy.deepcopy(atq_config) - atq_config["quant_cfg"]["*.output_layer.*"] = {"enable": False} - self.atq_config = atq_config + if quantization_config.algorithm is not None: + atq_config = QUANT_CFG_CHOICES[quantization_config.algorithm] + if quantization_config.algorithm != "fp8": + # disable quantization for the last output layer + atq_config = copy.deepcopy(atq_config) + atq_config["quant_cfg"]["*.output_layer.*"] = {"enable": False} + self.atq_config = atq_config + else: + self.atq_config = None def _load_model( self, @@ -160,12 +166,16 @@ def _load_and_modify_config( def quantize( self, model_file: str, - dataloader: List[List[str]], + dataloader: Optional[List[List[str]]], tensor_model_parallel_size: Optional[int] = None, pipeline_model_parallel_size: Optional[int] = None, ): """Quantize model checkpoint using given dataloader and optional custom parallelism settings.""" model = self._load_model(model_file, tensor_model_parallel_size, pipeline_model_parallel_size) + + if self.quantization_config.algorithm is None: + return model + model.set_inference_config(OmegaConf.to_container(self.inference_config)) def forward_loop(): @@ -174,7 +184,7 @@ def forward_loop(): print(f"Calibrating batch {i}") model.predict_step(batch, i) - atq.quantize(model, self.atq_config, forward_loop) + model = atq.quantize(model, self.atq_config, forward_loop) return model def export(self, model, model_save: str): From 3a7f07ec5f8b8877f7103075b4c4e42a760ce7fa Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 12 Mar 2024 14:39:34 +0100 Subject: [PATCH 27/30] Enable exporting qnemo tarball or just to a directory Signed-off-by: Jan Lasek --- Jenkinsfile | 4 ++-- nemo/export/quantize/quantizer.py | 28 +++++++++++++++++++--------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 15b04a34b12c..46b8f30be703 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -464,8 +464,8 @@ pipeline { sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ quantization.algorithm=null \ - model_save=/home/TestData/nlp/megatron_llama/ci_baseline.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci_baseline.qnemo' + model_save=/home/TestData/nlp/megatron_llama/ci_baseline' + sh 'rm -rf /home/TestData/nlp/megatron_llama/ci_baseline' } } stage('Llama2 - INT8 SQ') { diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 591848fd0adf..c25536208874 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -14,6 +14,7 @@ import copy import tarfile +from contextlib import nullcontext from typing import List, Optional import torch.distributed as dist @@ -51,11 +52,11 @@ class Quantizer: 1. Loading a Nemo model from disk using appropriate parallelism strategy 2. 
Calibrating the model to obtain appropriate algorithm-specific scaling factors - 3. Producing .qnemo tarball with model config (JSON), quantized weights (safetensors) - and tokenizer config (yaml). + 3. Producing output directory or .qnemo tarball with model config (json), + quantized weights (safetensors) and tokenizer config (yaml). - The .qnemo file produced is intended consumed by TensorRT-LLM toolbox for inference. - This can be achieved using Nemo inference containers. + The output directory (or .qnemo file) produced is intended to be consumed by TensorRT-LLM toolbox + for efficient inference. This can be achieved using Nemo inference containers. Currently supported and tested model family is Llama2. Model type needs to be specified in the quantization command with decoder_type parameter on exporting (see below). Quantizing other @@ -191,17 +192,26 @@ def export(self, model, model_save: str): """Export model to '.qnemo' format for TensorRT-LLM engine build.""" torch_dtype = torch_dtype_from_precision(self.export_config.dtype) - with temporary_directory() as tmp_dir: + # Setup model export handling: temporary directory for + # '.qnemo' tarball or directly write to model_save + save_qnemo = model_save.endswith(".qnemo") + if save_qnemo: + export_handler = temporary_directory() + else: + export_handler = nullcontext(enter_result=model_save) + + with export_handler as export_dir: export_model_config( model=model, decoder_type=self.export_config.decoder_type, dtype=torch_dtype, - export_dir=tmp_dir, + export_dir=export_dir, inference_tensor_parallel=self.export_config.inference_tensor_parallel, ) dist.barrier() # Wait until all ranks complete export_model_config step if is_global_rank_zero(): logging.info(f"Exporting quantized weights, model artifacts, and tokenizer config to {model_save}...") - with tarfile.open(model_save, "w:gz") as tar: - save_artifacts(model, tmp_dir) - tar.add(tmp_dir, arcname="./") + save_artifacts(model, export_dir) + if save_qnemo: + with tarfile.open(model_save, "w:gz") as tar: + tar.add(export_dir, arcname="./") From ac52816ea6a7de0308277f988c050fdb6f0415c7 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 12 Mar 2024 21:51:55 +0100 Subject: [PATCH 28/30] Drop AWQ testing for now Signed-off-by: Jan Lasek --- Jenkinsfile | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 46b8f30be703..37e0a229bf74 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -494,20 +494,6 @@ pipeline { sh 'rm -f /home/TestData/nlp/megatron_llama/ci_fp8.qnemo' } } - stage('Llama2 - INT4 AWQ') { - steps { - sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ - model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ - tensor_model_parallel_size=2 \ - trainer.devices=2 \ - quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - quantization.algorithm=int4_awq \ - quantization.num_calib_size=8 \ - inference.batch_size=2 \ - model_save=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo' - } - } } } From 81e8e0769a17158a2e5f43ea8cc05c20db8a2bd4 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 12 Mar 2024 21:52:45 +0100 Subject: [PATCH 29/30] Test case for export.inference_tensor_parallel=2 Signed-off-by: Jan Lasek --- Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Jenkinsfile b/Jenkinsfile index 37e0a229bf74..d2fae1fc687e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -490,6 +490,7 @@ pipeline { 
quantization.algorithm=fp8 \ quantization.num_calib_size=8 \ inference.batch_size=2 \ + export.inference_tensor_parallel=2 \ model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo' sh 'rm -f /home/TestData/nlp/megatron_llama/ci_fp8.qnemo' } From bf03390b178f4a94494ad977eba423acab329252 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 12 Mar 2024 21:57:41 +0100 Subject: [PATCH 30/30] Flag to export TRT-LLM config.json Signed-off-by: Jan Lasek --- .../nlp/language_modeling/conf/megatron_llama_quantization.yaml | 1 + nemo/export/quantize/quantizer.py | 1 + 2 files changed, 2 insertions(+) diff --git a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml index 322ab946febe..f3803dc4e69c 100644 --- a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml +++ b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml @@ -30,6 +30,7 @@ export: decoder_type: llama # gptnext, gpt2, llama inference_tensor_parallel: 1 # Default using 1 TP for inference dtype: 16 # Default precision data type + export_tensorrt_llm_config: true # export config to build TRT-LLM engine directly model_file: llama2-7b-fp16.nemo # Nemo file path model_save: llama2-7b-fp8.qnemo # Path where the quantized model will be saved diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index c25536208874..1ae375e6cfe7 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -207,6 +207,7 @@ def export(self, model, model_save: str): dtype=torch_dtype, export_dir=export_dir, inference_tensor_parallel=self.export_config.inference_tensor_parallel, + export_tensorrt_llm_config=self.export_config.export_tensorrt_llm_config, ) dist.barrier() # Wait until all ranks complete export_model_config step if is_global_rank_zero():