From bf8178ee030a1eedd0456d2ffa3e2de4ec8ace63 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 24 Jun 2024 14:11:42 -0700 Subject: [PATCH 01/13] add llama Signed-off-by: Chen Cui --- nemo/collections/llm/__init__.py | 18 ++++++++++++++++++ nemo/collections/llm/gpt/model/__init__.py | 12 ++++++++++++ nemo/collections/llm/gpt/model/mistral_7b.py | 3 --- nemo/lightning/io/connector.py | 3 ++- 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index cb8db0f5f272..2dcb778c5aef 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -20,6 +20,15 @@ Mistral7BModel, MixtralConfig, MixtralModel, + LlamaConfig, + Llama2Config7B, + Llama2Config13B, + Llama2Config70B, + CodeLlamaConfig7B, + CodeLlamaConfig13B, + CodeLlamaConfig34B, + CodeLlamaConfig70B, + LlamaModel, gpt_data_step, gpt_forward_step, ) @@ -35,6 +44,15 @@ "Mistral7BModel", "MixtralConfig", "MixtralModel", + "LlamaConfig", + "Llama2Config7B", + "Llama2Config13B", + "Llama2Config70B", + "CodeLlamaConfig7B", + "CodeLlamaConfig13B", + "CodeLlamaConfig34B", + "CodeLlamaConfig70B", + "LlamaModel", "PreTrainingDataModule", "FineTuningDataModule", "SquadDataModule", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 0ddaa61c7a35..b1726e89f2a6 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -7,6 +7,8 @@ ) from nemo.collections.llm.gpt.model.mistral_7b import Mistral7BConfig, Mistral7BModel from nemo.collections.llm.gpt.model.mixtral import MixtralConfig, MixtralModel +from nemo.collections.llm.gpt.model.llama import * + __all__ = [ "GPTConfig", @@ -15,7 +17,17 @@ "Mistral7BModel", "MixtralConfig", "MixtralModel", + "LlamaConfig", + "Llama2Config7B", + "Llama2Config13B", + "Llama2Config70B", + "CodeLlamaConfig7B", + "CodeLlamaConfig13B", + "CodeLlamaConfig34B", + "CodeLlamaConfig70B", + "LlamaModel", "MaskedTokenLossReduction", "gpt_data_step", "gpt_forward_step", ] + diff --git a/nemo/collections/llm/gpt/model/mistral_7b.py b/nemo/collections/llm/gpt/model/mistral_7b.py index ada67c17da25..ff9591581f86 100644 --- a/nemo/collections/llm/gpt/model/mistral_7b.py +++ b/nemo/collections/llm/gpt/model/mistral_7b.py @@ -71,9 +71,6 @@ def apply(self, output_path: Path) -> Path: return output_path - def on_import_ckpt(self, model: pl.LightningModule): - model.tokenizer = self.tokenizer - def convert_state(self, source, target): mapping = { "model.embed_tokens.weight": "embedding.word_embeddings.weight", diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index a6ab4afd6d1b..b813b6198477 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -217,4 +217,5 @@ def local_path(self, base_path: Optional[Path] = None) -> Path: return _base / str(self).replace("://", "/") - def on_import_ckpt(self, model: pl.LightningModule): ... 
+ def on_import_ckpt(self, model: pl.LightningModule): + model.tokenizer = self.tokenizer \ No newline at end of file From 2b5bce408367a9a1af96079edf7ef7c9ab6276a4 Mon Sep 17 00:00:00 2001 From: cuichenx Date: Mon, 24 Jun 2024 21:12:59 +0000 Subject: [PATCH 02/13] Apply isort and black reformatting Signed-off-by: cuichenx --- nemo/collections/llm/__init__.py | 18 +++++++++--------- nemo/collections/llm/gpt/model/__init__.py | 4 +--- nemo/lightning/io/connector.py | 2 +- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 2dcb778c5aef..ff34a10502f0 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -13,22 +13,22 @@ SquadDataModule, ) from nemo.collections.llm.gpt.model import ( + CodeLlamaConfig7B, + CodeLlamaConfig13B, + CodeLlamaConfig34B, + CodeLlamaConfig70B, GPTConfig, GPTModel, + Llama2Config7B, + Llama2Config13B, + Llama2Config70B, + LlamaConfig, + LlamaModel, MaskedTokenLossReduction, Mistral7BConfig, Mistral7BModel, MixtralConfig, MixtralModel, - LlamaConfig, - Llama2Config7B, - Llama2Config13B, - Llama2Config70B, - CodeLlamaConfig7B, - CodeLlamaConfig13B, - CodeLlamaConfig34B, - CodeLlamaConfig70B, - LlamaModel, gpt_data_step, gpt_forward_step, ) diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index b1726e89f2a6..6a2f075cbd4f 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -5,10 +5,9 @@ gpt_data_step, gpt_forward_step, ) +from nemo.collections.llm.gpt.model.llama import * from nemo.collections.llm.gpt.model.mistral_7b import Mistral7BConfig, Mistral7BModel from nemo.collections.llm.gpt.model.mixtral import MixtralConfig, MixtralModel -from nemo.collections.llm.gpt.model.llama import * - __all__ = [ "GPTConfig", @@ -30,4 +29,3 @@ "gpt_data_step", "gpt_forward_step", ] - diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index b813b6198477..41c81582bb63 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -218,4 +218,4 @@ def local_path(self, base_path: Optional[Path] = None) -> Path: return _base / str(self).replace("://", "/") def on_import_ckpt(self, model: pl.LightningModule): - model.tokenizer = self.tokenizer \ No newline at end of file + model.tokenizer = self.tokenizer From c56f3bb9d84fe857139fa801ada3797bd433b0f6 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 24 Jun 2024 14:18:35 -0700 Subject: [PATCH 03/13] add llama Signed-off-by: Chen Cui --- nemo/collections/llm/gpt/model/llama.py | 351 ++++++++++++++++++++++++ 1 file changed, 351 insertions(+) create mode 100644 nemo/collections/llm/gpt/model/llama.py diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py new file mode 100644 index 000000000000..218c318d1290 --- /dev/null +++ b/nemo/collections/llm/gpt/model/llama.py @@ -0,0 +1,351 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Callable, Type, Optional, Annotated + +import torch +import torch.nn.functional as F + +from nemo.lightning import io, OptimizerModule, teardown +from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.collections.llm.utils import Config + +if TYPE_CHECKING: + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + from transformers import ( + LlamaConfig as HFLlamaConfig, + LlamaForCausalLM, + ) + from 
nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +# Note: these Llama configs are copied from the corresponding HF model. You may need to modify the parameter for +# your own needs, in particular: seq_length and rotary_base. +@dataclass +class LlamaConfig(GPTConfig): + # configs that are common across model sizes + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + gated_linear_unit: bool = True + position_embedding_type: str = "rope" + add_bias_linear: bool = False + seq_length: int = 4096 + + +@dataclass +class Llama2Config7B(LlamaConfig): + num_layers: int = 32 + hidden_size: int = 4096 + num_attention_heads: int = 32 + num_query_groups: int = 32 + fnn_hidden_size: int = 11008 + + +@dataclass +class Llama2Config13B(LlamaConfig): + num_layers: int = 40 + hidden_size: int = 5120 + num_attention_heads: int = 40 + num_query_groups: int = 40 + ffn_hidden_size: int = 13824 + + +@dataclass +class Llama2Config70B(LlamaConfig): + num_layers: int = 80 + hidden_size: int = 8192 + num_attention_heads: int = 64 + num_query_groups: int = 8 + ffn_hidden_size: int = 28672 + +@dataclass +class CodeLlamaConfig7B(Llama2Config7B): + rotary_base: int = 1_000_000 + seq_length: int = 16384 + + +@dataclass +class CodeLlamaConfig13B(Llama2Config13B): + rotary_base: int = 1_000_000 + seq_length: int = 16384 + + +@dataclass +class CodeLlamaConfig34B(LlamaConfig): + num_layers: int = 48 + hidden_size: int = 8192 + num_attention_heads: int = 64 + num_query_groups: int = 8 + ffn_hidden_size: int = 22016 + rotary_base: int = 1_000_000 + seq_length: int = 16384 + + +@dataclass +class CodeLlamaConfig70B(Llama2Config70B): + pass + + +class LlamaModel(GPTModel): + def __init__( + self, + config: Annotated[Optional[LlamaConfig], Config[LlamaConfig]] = None, + optim: Optional[OptimizerModule] = None, + tokenizer: Optional["TokenizerSpec"] = None, + ): + super().__init__(config or LlamaConfig(), optim=optim, tokenizer=tokenizer) + + +@io.model_importer(LlamaModel, "hf") +class HFLlamaImporter(io.ModelConnector["LlamaForCausalLM", LlamaModel]): + def init(self) -> LlamaModel: + return LlamaModel(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + from transformers import LlamaForCausalLM + + source = LlamaForCausalLM.from_pretrained(str(self)) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + print(f"Converted Llama model to Nemo, model saved to {output_path}") + + teardown(trainer, target) + del trainer, target + + return output_path + def convert_state(self, source, target): + mapping = { + "model.embed_tokens.weight": "embedding.word_embeddings.weight", + "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", + "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "model.norm.weight": "decoder.final_layernorm.weight", + "lm_head.weight": "output_layer.weight" + } + + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=[_import_qkv, _import_linear_fc1] + ) + + @property + def tokenizer(self) -> "AutoTokenizer": + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + return AutoTokenizer(str(self)) + + @property + 
def config(self) -> LlamaConfig: + from transformers import LlamaConfig as HFLlamaConfig + + source = HFLlamaConfig.from_pretrained(str(self)) + + def make_vocab_size_divisible_by(vocab_size): + base = 128 + while vocab_size % base != 0: + base //= 2 + return base + + output = HFLlamaConfig( + seq_length=source.sliding_window, + num_layers=source.num_hidden_layers, + hidden_size=source.hidden_size, + ffn_hidden_size=source.intermediate_size, + num_attention_heads=source.num_attention_heads, + init_method_std=source.initializer_range, + layernorm_epsilon=source.rms_norm_eps, + num_query_groups=source.num_key_value_heads, + rotary_base=source.rope_theta, + gated_linear_unit=True, + make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), + window_size=[source.sliding_window, 0], + share_embeddings_and_output_weights=False, + ) + + return output + + +@io.model_exporter(LlamaModel, "hf") +class HFLlamaExporter(io.ModelConnector[LlamaModel, "LlamaForCausalLM"]): + def init(self) -> "LlamaForCausalLM": + from transformers import AutoModelForCausalLM + + return AutoModelForCausalLM.from_config(self.config) + + def apply(self, output_path: Path) -> Path: + target = self.init() + source, _ = self.nemo_load(str(self)) + target = self.convert_state(source, target) + + target = target.cpu() + target.save_pretrained(output_path) + self.tokenizer.save_pretrained(output_path) + + return output_path + + def convert_state(self, source, target): + mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", + "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", + "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight", + "decoder.final_layernorm.weight": "model.norm.weight", + "output_layer.weight": "lm_head.weight" + } + + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=[_export_qkv, _export_linear_fc1] + ) + + @property + def tokenizer(self): + return io.load_ckpt(str(self)).model.tokenizer.tokenizer + + @property + def config(self) -> "HFLlamaConfig": + source: LlamaConfig = io.load_ckpt(str(self)).model.config + + from transformers import LlamaConfig as HFLlamaConfig + + return HFLlamaConfig( + sliding_window=source.window_size[0], + num_hidden_layers=source.num_layers, + hidden_size=source.hidden_size, + intermediate_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + max_position_embeddings=source.seq_length, + initializer_range=source.init_method_std, + rms_norm_eps=source.layernorm_epsilon, + num_key_value_heads=source.num_query_groups, + rope_theta=source.rotary_base, + vocab_size=self.tokenizer.vocab_size, + ) + + +@io.state_transform( + source_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), + target_key="decoder.layers.*.self_attention.linear_qkv.weight" +) +def _import_qkv(ctx: io.TransformCTX, q, k, v): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + 
old_tensor_shape = q.size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights_l = [] + for i in range(num_query_groups): + qkv_weights_l.append( + q[i * heads_per_group: (i + 1) * heads_per_group, :, :] + ) + qkv_weights_l.append(k[i: i + 1, :, :]) + qkv_weights_l.append(v[i: i + 1, :, :]) + qkv_weights = torch.cat(qkv_weights_l) + assert qkv_weights.ndim == 3, qkv_weights.shape + assert ( + qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups + ), qkv_weights.shape + assert qkv_weights.shape[1] == head_size, qkv_weights.shape + assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape + + qkv_weights = qkv_weights.reshape( + [head_size * (head_num + 2 * num_query_groups), hidden_size] + ) + + return qkv_weights + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), +) +def _export_qkv(ctx: io.TransformCTX, linear_qkv): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size]) + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu() + k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu() + v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu() + + return q_proj, k_proj, v_proj + + +@io.state_transform( + source_key=( + "model.layers.*.mlp.gate_proj.weight", + "model.layers.*.mlp.up_proj.weight" + ), + target_key="decoder.layers.*.mlp.linear_fc1.weight" +) +def _import_linear_fc1(down, gate): + return torch.cat((down, gate), axis=0).float() + + +@io.state_transform( + source_key="decoder.layers.*.mlp.linear_fc1.weight", + target_key=( + "model.layers.*.mlp.gate_proj.weight", + "model.layers.*.mlp.up_proj.weight" + ), +) +def _export_linear_fc1(linear_fc1): + gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) + + return gate_proj, up_proj + + +__all__ = [ + "LlamaConfig", + "Llama2Config7B", + "Llama2Config13B", + "Llama2Config70B", + "CodeLlamaConfig7B", + "CodeLlamaConfig13B", + "CodeLlamaConfig34B", + "CodeLlamaConfig70B", + "LlamaModel", +] From a74fa0787593b737b509d4cc35eb707e18da9c61 Mon Sep 17 00:00:00 2001 From: cuichenx Date: Mon, 24 Jun 2024 21:19:15 +0000 Subject: [PATCH 04/13] Apply isort and black reformatting Signed-off-by: cuichenx --- nemo/collections/llm/gpt/model/llama.py | 74 +++++++++---------------- 1 file changed, 27 insertions(+), 47 deletions(-) diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 218c318d1290..16158bbc96a4 100644 --- a/nemo/collections/llm/gpt/model/llama.py 
+++ b/nemo/collections/llm/gpt/model/llama.py @@ -1,20 +1,19 @@ from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Callable, Type, Optional, Annotated +from typing import TYPE_CHECKING, Annotated, Callable, Optional, Type import torch import torch.nn.functional as F -from nemo.lightning import io, OptimizerModule, teardown from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import Config +from nemo.lightning import OptimizerModule, io, teardown if TYPE_CHECKING: + from transformers import LlamaConfig as HFLlamaConfig + from transformers import LlamaForCausalLM + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - from transformers import ( - LlamaConfig as HFLlamaConfig, - LlamaForCausalLM, - ) from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec @@ -57,6 +56,7 @@ class Llama2Config70B(LlamaConfig): num_query_groups: int = 8 ffn_hidden_size: int = 28672 + @dataclass class CodeLlamaConfig7B(Llama2Config7B): rotary_base: int = 1_000_000 @@ -115,6 +115,7 @@ def apply(self, output_path: Path) -> Path: del trainer, target return output_path + def convert_state(self, source, target): mapping = { "model.embed_tokens.weight": "embedding.word_embeddings.weight", @@ -123,19 +124,15 @@ def convert_state(self, source, target): "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", "model.norm.weight": "decoder.final_layernorm.weight", - "lm_head.weight": "output_layer.weight" + "lm_head.weight": "output_layer.weight", } - return io.apply_transforms( - source, - target, - mapping=mapping, - transforms=[_import_qkv, _import_linear_fc1] - ) + return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1]) @property def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + return AutoTokenizer(str(self)) @property @@ -195,15 +192,10 @@ def convert_state(self, source, target): "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight", "decoder.final_layernorm.weight": "model.norm.weight", - "output_layer.weight": "lm_head.weight" + "output_layer.weight": "lm_head.weight", } - return io.apply_transforms( - source, - target, - mapping=mapping, - transforms=[_export_qkv, _export_linear_fc1] - ) + return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_linear_fc1]) @property def tokenizer(self): @@ -232,11 +224,11 @@ def config(self) -> "HFLlamaConfig": @io.state_transform( source_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", ), - target_key="decoder.layers.*.self_attention.linear_qkv.weight" + target_key="decoder.layers.*.self_attention.linear_qkv.weight", ) def _import_qkv(ctx: io.TransformCTX, q, k, v): megatron_config = ctx.target.config @@ -258,22 +250,16 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): qkv_weights_l = [] for i in range(num_query_groups): - qkv_weights_l.append( - q[i * 
heads_per_group: (i + 1) * heads_per_group, :, :] - ) - qkv_weights_l.append(k[i: i + 1, :, :]) - qkv_weights_l.append(v[i: i + 1, :, :]) + qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) qkv_weights = torch.cat(qkv_weights_l) assert qkv_weights.ndim == 3, qkv_weights.shape - assert ( - qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups - ), qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape assert qkv_weights.shape[1] == head_size, qkv_weights.shape assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape - qkv_weights = qkv_weights.reshape( - [head_size * (head_num + 2 * num_query_groups), hidden_size] - ) + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) return qkv_weights @@ -281,9 +267,9 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): @io.state_transform( source_key="decoder.layers.*.self_attention.linear_qkv.weight", target_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", ), ) def _export_qkv(ctx: io.TransformCTX, linear_qkv): @@ -315,11 +301,8 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv): @io.state_transform( - source_key=( - "model.layers.*.mlp.gate_proj.weight", - "model.layers.*.mlp.up_proj.weight" - ), - target_key="decoder.layers.*.mlp.linear_fc1.weight" + source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), + target_key="decoder.layers.*.mlp.linear_fc1.weight", ) def _import_linear_fc1(down, gate): return torch.cat((down, gate), axis=0).float() @@ -327,10 +310,7 @@ def _import_linear_fc1(down, gate): @io.state_transform( source_key="decoder.layers.*.mlp.linear_fc1.weight", - target_key=( - "model.layers.*.mlp.gate_proj.weight", - "model.layers.*.mlp.up_proj.weight" - ), + target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), ) def _export_linear_fc1(linear_fc1): gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) From 5acc3c69e0fd0355b752456c29c70fc313f91563 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 24 Jun 2024 14:28:19 -0700 Subject: [PATCH 05/13] add llama3 Signed-off-by: Chen Cui --- nemo/collections/llm/__init__.py | 4 ++++ nemo/collections/llm/gpt/model/__init__.py | 2 ++ nemo/collections/llm/gpt/model/llama.py | 13 +++++++++++++ 3 files changed, 19 insertions(+) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index ff34a10502f0..0f0807950480 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -22,6 +22,8 @@ Llama2Config7B, Llama2Config13B, Llama2Config70B, + Llama3Config8B, + Llama3Config70B, LlamaConfig, LlamaModel, MaskedTokenLossReduction, @@ -48,6 +50,8 @@ "Llama2Config7B", "Llama2Config13B", "Llama2Config70B", + "Llama3Config8B", + "Llama3Config70B", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 6a2f075cbd4f..671fb20c5263 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -20,6 +20,8 @@ "Llama2Config7B", "Llama2Config13B", "Llama2Config70B", + "Llama3Config8B", + 
"Llama3Config70B", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 218c318d1290..fb69c69dc4a3 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -57,6 +57,17 @@ class Llama2Config70B(LlamaConfig): num_query_groups: int = 8 ffn_hidden_size: int = 28672 +@dataclass +class Llama3Config8B(Llama2Config7B): + seq_length: int = 8192 + num_query_groups: int = 8 + fnn_hidden_size: int = 14336 + +@dataclass +class Llama3Config70B(Llama2Config70B): + seq_length: int = 8192 + + @dataclass class CodeLlamaConfig7B(Llama2Config7B): rotary_base: int = 1_000_000 @@ -343,6 +354,8 @@ def _export_linear_fc1(linear_fc1): "Llama2Config7B", "Llama2Config13B", "Llama2Config70B", + "Llama3Config8B", + "Llama3Config70B", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", From 7b4fe07657e6c1a364d195c7dc3d9941cb542cb7 Mon Sep 17 00:00:00 2001 From: cuichenx Date: Mon, 24 Jun 2024 21:29:11 +0000 Subject: [PATCH 06/13] Apply isort and black reformatting Signed-off-by: cuichenx --- nemo/collections/llm/gpt/model/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 824e70fc6468..251c9e36dbc0 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -63,6 +63,7 @@ class Llama3Config8B(Llama2Config7B): num_query_groups: int = 8 fnn_hidden_size: int = 14336 + @dataclass class Llama3Config70B(Llama2Config70B): seq_length: int = 8192 From 5118b63011a3b8ebeacca9b5c5a8e0ec4bd03ccd Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 24 Jun 2024 14:52:47 -0700 Subject: [PATCH 07/13] fix typo Signed-off-by: Chen Cui --- nemo/collections/llm/gpt/model/llama.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 824e70fc6468..4211c57fd519 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -36,7 +36,7 @@ class Llama2Config7B(LlamaConfig): hidden_size: int = 4096 num_attention_heads: int = 32 num_query_groups: int = 32 - fnn_hidden_size: int = 11008 + ffn_hidden_size: int = 11008 @dataclass @@ -61,7 +61,7 @@ class Llama2Config70B(LlamaConfig): class Llama3Config8B(Llama2Config7B): seq_length: int = 8192 num_query_groups: int = 8 - fnn_hidden_size: int = 14336 + ffn_hidden_size: int = 14336 @dataclass class Llama3Config70B(Llama2Config70B): @@ -158,8 +158,7 @@ def make_vocab_size_divisible_by(vocab_size): base //= 2 return base - output = HFLlamaConfig( - seq_length=source.sliding_window, + output = LlamaConfig( num_layers=source.num_hidden_layers, hidden_size=source.hidden_size, ffn_hidden_size=source.intermediate_size, @@ -170,7 +169,6 @@ def make_vocab_size_divisible_by(vocab_size): rotary_base=source.rope_theta, gated_linear_unit=True, make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), - window_size=[source.sliding_window, 0], share_embeddings_and_output_weights=False, ) @@ -219,7 +217,6 @@ def config(self) -> "HFLlamaConfig": from transformers import LlamaConfig as HFLlamaConfig return HFLlamaConfig( - sliding_window=source.window_size[0], num_hidden_layers=source.num_layers, hidden_size=source.hidden_size, intermediate_size=source.ffn_hidden_size, From 52321f41673aa350d9696924657485a24ae3b335 Mon Sep 17 00:00:00 2001 From: Chen 
Cui Date: Mon, 24 Jun 2024 14:53:05 -0700 Subject: [PATCH 08/13] enable importers with multiple models Signed-off-by: Chen Cui --- nemo/lightning/io/mixin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index 62b9a165c542..b286341cc1cd 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -198,7 +198,7 @@ def register_importer(cls, ext: str, default_path: Optional[str] = None) -> Call """ def decorator(connector: Type[ConnT]) -> Type[ConnT]: - cls._IMPORTERS[ext] = connector + cls._IMPORTERS[str(cls)+ext] = connector if default_path: connector.default_path = default_path return connector @@ -221,7 +221,7 @@ def register_exporter(cls, ext: str, default_path: Optional[str] = None) -> Call """ def decorator(connector: Type[ConnT]) -> Type[ConnT]: - cls._EXPORTERS[ext] = connector + cls._EXPORTERS[str(cls)+ext] = connector if default_path: connector.default_path = default_path return connector @@ -310,7 +310,7 @@ def _get_connector(cls, ext, path=None, importer=True) -> ModelConnector: else: _path = path - connector = cls._IMPORTERS.get(ext) if importer else cls._EXPORTERS.get(ext) + connector = cls._IMPORTERS.get(str(cls)+ext) if importer else cls._EXPORTERS.get(str(cls)+ext) if not connector: raise ValueError(f"No connector found for extension '{ext}'") From 1765002bcff660a8804610891d2f6fb5e593a09e Mon Sep 17 00:00:00 2001 From: cuichenx Date: Mon, 24 Jun 2024 21:54:19 +0000 Subject: [PATCH 09/13] Apply isort and black reformatting Signed-off-by: cuichenx --- nemo/lightning/io/mixin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index b286341cc1cd..54b6e7195bc9 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -198,7 +198,7 @@ def register_importer(cls, ext: str, default_path: Optional[str] = None) -> Call """ def decorator(connector: Type[ConnT]) -> Type[ConnT]: - cls._IMPORTERS[str(cls)+ext] = connector + cls._IMPORTERS[str(cls) + ext] = connector if default_path: connector.default_path = default_path return connector @@ -221,7 +221,7 @@ def register_exporter(cls, ext: str, default_path: Optional[str] = None) -> Call """ def decorator(connector: Type[ConnT]) -> Type[ConnT]: - cls._EXPORTERS[str(cls)+ext] = connector + cls._EXPORTERS[str(cls) + ext] = connector if default_path: connector.default_path = default_path return connector @@ -310,7 +310,7 @@ def _get_connector(cls, ext, path=None, importer=True) -> ModelConnector: else: _path = path - connector = cls._IMPORTERS.get(str(cls)+ext) if importer else cls._EXPORTERS.get(str(cls)+ext) + connector = cls._IMPORTERS.get(str(cls) + ext) if importer else cls._EXPORTERS.get(str(cls) + ext) if not connector: raise ValueError(f"No connector found for extension '{ext}'") From 9051f4d834b4f0b5c398953e52ac5b79fe0c81c0 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 24 Jun 2024 15:42:13 -0700 Subject: [PATCH 10/13] add gemma Signed-off-by: Chen Cui --- nemo/collections/llm/__init__.py | 12 + nemo/collections/llm/gpt/model/__init__.py | 7 + nemo/collections/llm/gpt/model/gemma.py | 322 +++++++++++++++++++++ 3 files changed, 341 insertions(+) create mode 100644 nemo/collections/llm/gpt/model/gemma.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 0f0807950480..2fbac2304569 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -26,6 +26,12 @@ Llama3Config70B, 
LlamaConfig, LlamaModel, + GemmaConfig, + GemmaConfig2B, + GemmaConfig7B, + CodeGemmaConfig2B, + CodeGemmaConfig7B, + GemmaModel, MaskedTokenLossReduction, Mistral7BConfig, Mistral7BModel, @@ -57,6 +63,12 @@ "CodeLlamaConfig34B", "CodeLlamaConfig70B", "LlamaModel", + "GemmaConfig", + "GemmaConfig2B", + "GemmaConfig7B", + "CodeGemmaConfig2B", + "CodeGemmaConfig7B", + "GemmaModel", "PreTrainingDataModule", "FineTuningDataModule", "SquadDataModule", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 671fb20c5263..334ee05bd5df 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -6,6 +6,7 @@ gpt_forward_step, ) from nemo.collections.llm.gpt.model.llama import * +from nemo.collections.llm.gpt.model.gemma import * from nemo.collections.llm.gpt.model.mistral_7b import Mistral7BConfig, Mistral7BModel from nemo.collections.llm.gpt.model.mixtral import MixtralConfig, MixtralModel @@ -26,6 +27,12 @@ "CodeLlamaConfig13B", "CodeLlamaConfig34B", "CodeLlamaConfig70B", + "GemmaConfig", + "GemmaConfig2B", + "GemmaConfig7B", + "CodeGemmaConfig2B", + "CodeGemmaConfig7B", + "GemmaModel", "LlamaModel", "MaskedTokenLossReduction", "gpt_data_step", diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py new file mode 100644 index 000000000000..8e0c4f54278f --- /dev/null +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -0,0 +1,322 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Callable, Generic, Optional, Type, TypeVar, Annotated + +import torch +from nemo.collections.nlp.modules.common.megatron.utils import openai_gelu + +from nemo.lightning import io, OptimizerModule, teardown +from nemo.lightning.io.connector import TargetT +from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.collections.llm.utils import Config + + +if TYPE_CHECKING: + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + from transformers import ( + GemmaForCausalLM, + ) + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +# Note: Gemma requires huggingface transformers >= 4.38 +# Note: these Gemma configs are copied from the corresponding HF model. You may need to modify the parameter for +# your own needs, in particular: seq_length and rotary_base. 
+@dataclass +class GemmaConfig(GPTConfig): + # configs that are common across model sizes + normalization: str = "RMSNorm" + activation_func: Callable = openai_gelu + gated_linear_unit: bool = True + position_embedding_type: str = "rope" + add_bias_linear: bool = False + seq_length: int = 8192 + kv_channels: int = 256 + share_embeddings_and_output_weights: bool = True + # Note: different behavior compared to Legacy NeMo + # Legacy NeMo does not set layernorm_zero_centered_gamma and instead adds 1 in the HF -> NeMo conversion script + # The present implementation is more in line with the official implementation + layernorm_zero_centered_gamma: bool = True + + +@dataclass +class GemmaConfig2B(GemmaConfig): + num_layers: int = 18 + hidden_size: int = 2048 + num_attention_heads: int = 8 + num_query_groups: int = 1 + ffn_hidden_size: int = 16384 + + +@dataclass +class GemmaConfig7B(GemmaConfig): + num_layers: int = 28 + hidden_size: int = 3072 + num_attention_heads: int = 16 + num_query_groups: int = 16 + ffn_hidden_size: int = 24576 + + +class CodeGemmaConfig2B(GemmaConfig2B): + pass + + +class CodeGemmaConfig7B(GemmaConfig7B): + pass + + +class GemmaModel(GPTModel): + def __init__( + self, + config: Annotated[Optional[GemmaConfig], Config[GemmaConfig]] = None, + optim: Optional[OptimizerModule] = None, + tokenizer: Optional["TokenizerSpec"] = None, + ): + super().__init__(config or GemmaConfig(), optim=optim, tokenizer=tokenizer) + +@io.model_importer(GemmaModel, "hf") +class HFGemmaImporter(io.ModelConnector["GemmaForCausalLM", GemmaModel]): + def init(self) -> GemmaModel: + return GemmaModel(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + from transformers import GemmaForCausalLM + + source = GemmaForCausalLM.from_pretrained(str(self)) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + print(f"Converted Gemma model to Nemo, model saved to {output_path}") + + teardown(trainer, target) + del trainer, target + + return output_path + + def convert_state(self, source, target): + mapping = { + "model.embed_tokens.weight": "embedding.word_embeddings.weight", + "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", + "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "model.norm.weight": "decoder.final_layernorm.weight", + } + + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=[_import_qkv, _import_linear_fc1] + ) + + @property + def tokenizer(self) -> "AutoTokenizer": + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + + return AutoTokenizer(str(self)) + + @property + def config(self) -> GemmaConfig: + from transformers import GemmaConfig as HFGemmaConfig + source = HFGemmaConfig.from_pretrained(str(self)) + + def make_vocab_size_divisible_by(vocab_size): + base = 128 + while vocab_size % base != 0: + base //= 2 + return base + + output = GemmaConfig( + num_layers=source.num_hidden_layers, + hidden_size=source.hidden_size, + ffn_hidden_size=source.intermediate_size, + num_attention_heads=source.num_attention_heads, + init_method_std=source.initializer_range, + layernorm_epsilon=source.rms_norm_eps, + 
num_query_groups=source.num_key_value_heads, + rotary_base=source.rope_theta, + gated_linear_unit=True, + make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), + share_embeddings_and_output_weights=False, + ) + + return output + + +@io.model_exporter(GemmaModel, "hf") +class HFGemmaExporter(io.ModelConnector[GemmaModel, "GemmaForCausalLM"]): + def init(self) -> "GemmaForCausalLM": + from transformers import AutoModelForCausalLM + + return AutoModelForCausalLM.from_config(self.config) + + def apply(self, output_path: Path) -> Path: + target = self.init() + source, _ = self.nemo_load(str(self)) + target = self.convert_state(source, target) + + target = target.cpu() + target.save_pretrained(output_path) + self.tokenizer.save_pretrained(output_path) + + return output_path + + def convert_state(self, source, target): + mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", + "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", + "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight", + "decoder.final_layernorm.weight": "model.norm.weight", + } + + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=[_export_qkv, _export_linear_fc1] + ) + + @property + def tokenizer(self): + return io.load_ckpt(str(self)).model.tokenizer.tokenizer + + @property + def config(self) -> "GemmaConfig": + source: GemmaConfig = io.load_ckpt(str(self)).model.config + + from transformers import GemmaConfig as HFGemmaConfig + + return HFGemmaConfig( + num_hidden_layers=source.num_layers, + hidden_size=source.hidden_size, + intermediate_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + max_position_embeddings=source.seq_length, + initializer_range=source.init_method_std, + rms_norm_eps=source.layernorm_epsilon, + num_key_value_heads=source.num_query_groups, + vocab_size=self.tokenizer.vocab_size + ) + + +@io.state_transform( + source_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), + target_key="decoder.layers.*.self_attention.linear_qkv.weight" +) +def _import_qkv(ctx: io.TransformCTX, q, k, v): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + old_tensor_shape = q.size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights_l = [] + for i in range(num_query_groups): + qkv_weights_l.append( + q[i * heads_per_group: (i + 1) * heads_per_group, :, :] + ) + qkv_weights_l.append(k[i: i + 1, :, :]) + qkv_weights_l.append(v[i: i + 1, :, :]) + qkv_weights = torch.cat(qkv_weights_l) + assert qkv_weights.ndim == 3, qkv_weights.shape + assert ( + qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups + ), qkv_weights.shape + assert qkv_weights.shape[1] == head_size, 
qkv_weights.shape + assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape + + qkv_weights = qkv_weights.reshape( + [head_size * (head_num + 2 * num_query_groups), hidden_size] + ) + + return qkv_weights + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), +) +def _export_qkv(ctx: io.TransformCTX, linear_qkv): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size]) + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu() + k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu() + v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu() + + return q_proj, k_proj, v_proj + + +@io.state_transform( + source_key=( + "model.layers.*.mlp.gate_proj.weight", + "model.layers.*.mlp.up_proj.weight" + ), + target_key="decoder.layers.*.mlp.linear_fc1.weight" +) +def _import_linear_fc1(down, gate): + return torch.cat((down, gate), axis=0).float() + + +@io.state_transform( + source_key="decoder.layers.*.mlp.linear_fc1.weight", + target_key=( + "model.layers.*.mlp.gate_proj.weight", + "model.layers.*.mlp.up_proj.weight" + ), +) +def _export_linear_fc1(linear_fc1): + gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) + + return gate_proj, up_proj + + +__all__ = [ + "GemmaConfig", + "GemmaConfig2B", + "GemmaConfig7B", + "CodeGemmaConfig2B", + "CodeGemmaConfig7B", + "GemmaModel", +] \ No newline at end of file From b8d13c27cec172b38e06991c0e82680e47535466 Mon Sep 17 00:00:00 2001 From: cuichenx Date: Mon, 24 Jun 2024 22:42:54 +0000 Subject: [PATCH 11/13] Apply isort and black reformatting Signed-off-by: cuichenx --- nemo/collections/llm/__init__.py | 12 ++-- nemo/collections/llm/gpt/model/__init__.py | 2 +- nemo/collections/llm/gpt/model/gemma.py | 76 ++++++++-------------- 3 files changed, 34 insertions(+), 56 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 2fbac2304569..19911b544f43 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -13,10 +13,16 @@ SquadDataModule, ) from nemo.collections.llm.gpt.model import ( + CodeGemmaConfig2B, + CodeGemmaConfig7B, CodeLlamaConfig7B, CodeLlamaConfig13B, CodeLlamaConfig34B, CodeLlamaConfig70B, + GemmaConfig, + GemmaConfig2B, + GemmaConfig7B, + GemmaModel, GPTConfig, GPTModel, Llama2Config7B, @@ -26,12 +32,6 @@ Llama3Config70B, LlamaConfig, LlamaModel, - GemmaConfig, - GemmaConfig2B, - GemmaConfig7B, - CodeGemmaConfig2B, - CodeGemmaConfig7B, - GemmaModel, MaskedTokenLossReduction, Mistral7BConfig, Mistral7BModel, diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 334ee05bd5df..2da72539fd15 100644 --- 
a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -5,8 +5,8 @@ gpt_data_step, gpt_forward_step, ) -from nemo.collections.llm.gpt.model.llama import * from nemo.collections.llm.gpt.model.gemma import * +from nemo.collections.llm.gpt.model.llama import * from nemo.collections.llm.gpt.model.mistral_7b import Mistral7BConfig, Mistral7BModel from nemo.collections.llm.gpt.model.mixtral import MixtralConfig, MixtralModel diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index 8e0c4f54278f..f933676c0c63 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -1,21 +1,19 @@ from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Callable, Generic, Optional, Type, TypeVar, Annotated +from typing import TYPE_CHECKING, Annotated, Callable, Generic, Optional, Type, TypeVar import torch -from nemo.collections.nlp.modules.common.megatron.utils import openai_gelu -from nemo.lightning import io, OptimizerModule, teardown -from nemo.lightning.io.connector import TargetT from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import Config - +from nemo.collections.nlp.modules.common.megatron.utils import openai_gelu +from nemo.lightning import OptimizerModule, io, teardown +from nemo.lightning.io.connector import TargetT if TYPE_CHECKING: + from transformers import GemmaForCausalLM + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - from transformers import ( - GemmaForCausalLM, - ) from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec @@ -74,6 +72,7 @@ def __init__( ): super().__init__(config or GemmaConfig(), optim=optim, tokenizer=tokenizer) + @io.model_importer(GemmaModel, "hf") class HFGemmaImporter(io.ModelConnector["GemmaForCausalLM", GemmaModel]): def init(self) -> GemmaModel: @@ -105,12 +104,7 @@ def convert_state(self, source, target): "model.norm.weight": "decoder.final_layernorm.weight", } - return io.apply_transforms( - source, - target, - mapping=mapping, - transforms=[_import_qkv, _import_linear_fc1] - ) + return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1]) @property def tokenizer(self) -> "AutoTokenizer": @@ -121,6 +115,7 @@ def tokenizer(self) -> "AutoTokenizer": @property def config(self) -> GemmaConfig: from transformers import GemmaConfig as HFGemmaConfig + source = HFGemmaConfig.from_pretrained(str(self)) def make_vocab_size_divisible_by(vocab_size): @@ -174,12 +169,7 @@ def convert_state(self, source, target): "decoder.final_layernorm.weight": "model.norm.weight", } - return io.apply_transforms( - source, - target, - mapping=mapping, - transforms=[_export_qkv, _export_linear_fc1] - ) + return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_linear_fc1]) @property def tokenizer(self): @@ -200,17 +190,17 @@ def config(self) -> "GemmaConfig": initializer_range=source.init_method_std, rms_norm_eps=source.layernorm_epsilon, num_key_value_heads=source.num_query_groups, - vocab_size=self.tokenizer.vocab_size + vocab_size=self.tokenizer.vocab_size, ) @io.state_transform( source_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + 
"model.layers.*.self_attn.v_proj.weight", ), - target_key="decoder.layers.*.self_attention.linear_qkv.weight" + target_key="decoder.layers.*.self_attention.linear_qkv.weight", ) def _import_qkv(ctx: io.TransformCTX, q, k, v): megatron_config = ctx.target.config @@ -232,22 +222,16 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): qkv_weights_l = [] for i in range(num_query_groups): - qkv_weights_l.append( - q[i * heads_per_group: (i + 1) * heads_per_group, :, :] - ) - qkv_weights_l.append(k[i: i + 1, :, :]) - qkv_weights_l.append(v[i: i + 1, :, :]) + qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) qkv_weights = torch.cat(qkv_weights_l) assert qkv_weights.ndim == 3, qkv_weights.shape - assert ( - qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups - ), qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape assert qkv_weights.shape[1] == head_size, qkv_weights.shape assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape - qkv_weights = qkv_weights.reshape( - [head_size * (head_num + 2 * num_query_groups), hidden_size] - ) + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) return qkv_weights @@ -255,9 +239,9 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): @io.state_transform( source_key="decoder.layers.*.self_attention.linear_qkv.weight", target_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", ), ) def _export_qkv(ctx: io.TransformCTX, linear_qkv): @@ -289,11 +273,8 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv): @io.state_transform( - source_key=( - "model.layers.*.mlp.gate_proj.weight", - "model.layers.*.mlp.up_proj.weight" - ), - target_key="decoder.layers.*.mlp.linear_fc1.weight" + source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), + target_key="decoder.layers.*.mlp.linear_fc1.weight", ) def _import_linear_fc1(down, gate): return torch.cat((down, gate), axis=0).float() @@ -301,10 +282,7 @@ def _import_linear_fc1(down, gate): @io.state_transform( source_key="decoder.layers.*.mlp.linear_fc1.weight", - target_key=( - "model.layers.*.mlp.gate_proj.weight", - "model.layers.*.mlp.up_proj.weight" - ), + target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), ) def _export_linear_fc1(linear_fc1): gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) @@ -319,4 +297,4 @@ def _export_linear_fc1(linear_fc1): "CodeGemmaConfig2B", "CodeGemmaConfig7B", "GemmaModel", -] \ No newline at end of file +] From aed0b7c0b7403ee2d83ef8f47840be9c3be43c8e Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 24 Jun 2024 16:21:59 -0700 Subject: [PATCH 12/13] checks Signed-off-by: Chen Cui --- nemo/collections/llm/gpt/model/gemma.py | 65 ++++++++++++++++--------- nemo/collections/llm/gpt/model/llama.py | 2 +- 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index f933676c0c63..b45099996327 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -1,6 +1,6 @@ from dataclasses import dataclass from pathlib import Path -from typing import 
TYPE_CHECKING, Annotated, Callable, Generic, Optional, Type, TypeVar +from typing import TYPE_CHECKING, Annotated, Callable, Optional import torch @@ -8,7 +8,6 @@ from nemo.collections.llm.utils import Config from nemo.collections.nlp.modules.common.megatron.utils import openai_gelu from nemo.lightning import OptimizerModule, io, teardown -from nemo.lightning.io.connector import TargetT if TYPE_CHECKING: from transformers import GemmaForCausalLM @@ -72,7 +71,6 @@ def __init__( ): super().__init__(config or GemmaConfig(), optim=optim, tokenizer=tokenizer) - @io.model_importer(GemmaModel, "hf") class HFGemmaImporter(io.ModelConnector["GemmaForCausalLM", GemmaModel]): def init(self) -> GemmaModel: @@ -104,7 +102,12 @@ def convert_state(self, source, target): "model.norm.weight": "decoder.final_layernorm.weight", } - return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1]) + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=[_import_qkv, _import_linear_fc1] + ) @property def tokenizer(self) -> "AutoTokenizer": @@ -115,7 +118,6 @@ def tokenizer(self) -> "AutoTokenizer": @property def config(self) -> GemmaConfig: from transformers import GemmaConfig as HFGemmaConfig - source = HFGemmaConfig.from_pretrained(str(self)) def make_vocab_size_divisible_by(vocab_size): @@ -169,7 +171,12 @@ def convert_state(self, source, target): "decoder.final_layernorm.weight": "model.norm.weight", } - return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_linear_fc1]) + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=[_export_qkv, _export_linear_fc1] + ) @property def tokenizer(self): @@ -190,17 +197,17 @@ def config(self) -> "GemmaConfig": initializer_range=source.init_method_std, rms_norm_eps=source.layernorm_epsilon, num_key_value_heads=source.num_query_groups, - vocab_size=self.tokenizer.vocab_size, + vocab_size=self.tokenizer.vocab_size ) @io.state_transform( source_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", ), - target_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key="decoder.layers.*.self_attention.linear_qkv.weight" ) def _import_qkv(ctx: io.TransformCTX, q, k, v): megatron_config = ctx.target.config @@ -222,16 +229,22 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): qkv_weights_l = [] for i in range(num_query_groups): - qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) - qkv_weights_l.append(k[i : i + 1, :, :]) - qkv_weights_l.append(v[i : i + 1, :, :]) + qkv_weights_l.append( + q[i * heads_per_group: (i + 1) * heads_per_group, :, :] + ) + qkv_weights_l.append(k[i: i + 1, :, :]) + qkv_weights_l.append(v[i: i + 1, :, :]) qkv_weights = torch.cat(qkv_weights_l) assert qkv_weights.ndim == 3, qkv_weights.shape - assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape + assert ( + qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups + ), qkv_weights.shape assert qkv_weights.shape[1] == head_size, qkv_weights.shape assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape - qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + qkv_weights = qkv_weights.reshape( + [head_size * (head_num + 2 * 
num_query_groups), hidden_size] + ) return qkv_weights @@ -239,9 +252,9 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): @io.state_transform( source_key="decoder.layers.*.self_attention.linear_qkv.weight", target_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", ), ) def _export_qkv(ctx: io.TransformCTX, linear_qkv): @@ -273,8 +286,11 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv): @io.state_transform( - source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), - target_key="decoder.layers.*.mlp.linear_fc1.weight", + source_key=( + "model.layers.*.mlp.gate_proj.weight", + "model.layers.*.mlp.up_proj.weight" + ), + target_key="decoder.layers.*.mlp.linear_fc1.weight" ) def _import_linear_fc1(down, gate): return torch.cat((down, gate), axis=0).float() @@ -282,7 +298,10 @@ def _import_linear_fc1(down, gate): @io.state_transform( source_key="decoder.layers.*.mlp.linear_fc1.weight", - target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), + target_key=( + "model.layers.*.mlp.gate_proj.weight", + "model.layers.*.mlp.up_proj.weight" + ), ) def _export_linear_fc1(linear_fc1): gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) @@ -297,4 +316,4 @@ def _export_linear_fc1(linear_fc1): "CodeGemmaConfig2B", "CodeGemmaConfig7B", "GemmaModel", -] +] \ No newline at end of file diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 210101e1b206..aa089b077041 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -1,6 +1,6 @@ from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Annotated, Callable, Optional, Type +from typing import TYPE_CHECKING, Annotated, Callable, Optional import torch import torch.nn.functional as F From 189764d856c72b1d101d91792d58b55d9ea10e75 Mon Sep 17 00:00:00 2001 From: cuichenx Date: Mon, 24 Jun 2024 23:22:36 +0000 Subject: [PATCH 13/13] Apply isort and black reformatting Signed-off-by: cuichenx --- nemo/collections/llm/gpt/model/gemma.py | 62 +++++++++---------------- 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index b45099996327..ff9772b1b74c 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -71,6 +71,7 @@ def __init__( ): super().__init__(config or GemmaConfig(), optim=optim, tokenizer=tokenizer) + @io.model_importer(GemmaModel, "hf") class HFGemmaImporter(io.ModelConnector["GemmaForCausalLM", GemmaModel]): def init(self) -> GemmaModel: @@ -102,12 +103,7 @@ def convert_state(self, source, target): "model.norm.weight": "decoder.final_layernorm.weight", } - return io.apply_transforms( - source, - target, - mapping=mapping, - transforms=[_import_qkv, _import_linear_fc1] - ) + return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1]) @property def tokenizer(self) -> "AutoTokenizer": @@ -118,6 +114,7 @@ def tokenizer(self) -> "AutoTokenizer": @property def config(self) -> GemmaConfig: from transformers import GemmaConfig as HFGemmaConfig + source = HFGemmaConfig.from_pretrained(str(self)) def make_vocab_size_divisible_by(vocab_size): @@ -171,12 +168,7 @@ def 
convert_state(self, source, target): "decoder.final_layernorm.weight": "model.norm.weight", } - return io.apply_transforms( - source, - target, - mapping=mapping, - transforms=[_export_qkv, _export_linear_fc1] - ) + return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_linear_fc1]) @property def tokenizer(self): @@ -197,17 +189,17 @@ def config(self) -> "GemmaConfig": initializer_range=source.init_method_std, rms_norm_eps=source.layernorm_epsilon, num_key_value_heads=source.num_query_groups, - vocab_size=self.tokenizer.vocab_size + vocab_size=self.tokenizer.vocab_size, ) @io.state_transform( source_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", ), - target_key="decoder.layers.*.self_attention.linear_qkv.weight" + target_key="decoder.layers.*.self_attention.linear_qkv.weight", ) def _import_qkv(ctx: io.TransformCTX, q, k, v): megatron_config = ctx.target.config @@ -229,22 +221,16 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): qkv_weights_l = [] for i in range(num_query_groups): - qkv_weights_l.append( - q[i * heads_per_group: (i + 1) * heads_per_group, :, :] - ) - qkv_weights_l.append(k[i: i + 1, :, :]) - qkv_weights_l.append(v[i: i + 1, :, :]) + qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) qkv_weights = torch.cat(qkv_weights_l) assert qkv_weights.ndim == 3, qkv_weights.shape - assert ( - qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups - ), qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape assert qkv_weights.shape[1] == head_size, qkv_weights.shape assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape - qkv_weights = qkv_weights.reshape( - [head_size * (head_num + 2 * num_query_groups), hidden_size] - ) + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) return qkv_weights @@ -252,9 +238,9 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): @io.state_transform( source_key="decoder.layers.*.self_attention.linear_qkv.weight", target_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", ), ) def _export_qkv(ctx: io.TransformCTX, linear_qkv): @@ -286,11 +272,8 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv): @io.state_transform( - source_key=( - "model.layers.*.mlp.gate_proj.weight", - "model.layers.*.mlp.up_proj.weight" - ), - target_key="decoder.layers.*.mlp.linear_fc1.weight" + source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), + target_key="decoder.layers.*.mlp.linear_fc1.weight", ) def _import_linear_fc1(down, gate): return torch.cat((down, gate), axis=0).float() @@ -298,10 +281,7 @@ def _import_linear_fc1(down, gate): @io.state_transform( source_key="decoder.layers.*.mlp.linear_fc1.weight", - target_key=( - "model.layers.*.mlp.gate_proj.weight", - "model.layers.*.mlp.up_proj.weight" - ), + target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), ) def _export_linear_fc1(linear_fc1): 
gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) @@ -316,4 +296,4 @@ def _export_linear_fc1(linear_fc1): "CodeGemmaConfig2B", "CodeGemmaConfig7B", "GemmaModel", -] \ No newline at end of file +]
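
Usage sketch for the series: the patches above wire Llama 2/3, CodeLlama, and Gemma into nemo.collections.llm. Each family gets dataclass configs whose defaults mirror the corresponding Hugging Face checkpoints, a GPTModel subclass (LlamaModel, GemmaModel), and "hf" importer/exporter connectors registered through io.model_importer / io.model_exporter; the io mixin now keys its connector registries by str(cls) + ext, so several model classes can each register their own "hf" connector without clobbering one another. A minimal Python sketch of the new classes follows. The checkpoint id and output path are placeholders, and the connector-driven conversion is commented out because the public entry point for running an io.ModelConnector is not part of this series; treat that part as an assumption, not the supported API.

from pathlib import Path

from nemo.collections import llm
from nemo.collections.llm.gpt.model.llama import HFLlamaImporter

# Model wrappers built from the configs added in this series; defaults follow
# the corresponding HF models, and fields such as seq_length or rotary_base
# can be overridden per run.
llama2_7b = llm.LlamaModel(llm.Llama2Config7B())
gemma_2b = llm.GemmaModel(llm.GemmaConfig2B())

# Hypothetical HF -> NeMo conversion via the importer from patch 03: the
# connector is path-like (str(self) is the HF repo id or local path) and
# apply() runs convert_state() and saves the NeMo checkpoint.
# importer = HFLlamaImporter("meta-llama/Llama-2-7b-hf")  # placeholder repo id
# importer.apply(Path("llama2_7b_nemo"))                  # placeholder output path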