From bf8178ee030a1eedd0456d2ffa3e2de4ec8ace63 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 24 Jun 2024 14:11:42 -0700 Subject: [PATCH 01/13] add llama Signed-off-by: Chen Cui --- nemo/collections/llm/__init__.py | 18 ++++++++++++++++++ nemo/collections/llm/gpt/model/__init__.py | 12 ++++++++++++ nemo/collections/llm/gpt/model/mistral_7b.py | 3 --- nemo/lightning/io/connector.py | 3 ++- 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index cb8db0f5f272..2dcb778c5aef 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -20,6 +20,15 @@ Mistral7BModel, MixtralConfig, MixtralModel, + LlamaConfig, + Llama2Config7B, + Llama2Config13B, + Llama2Config70B, + CodeLlamaConfig7B, + CodeLlamaConfig13B, + CodeLlamaConfig34B, + CodeLlamaConfig70B, + LlamaModel, gpt_data_step, gpt_forward_step, ) @@ -35,6 +44,15 @@ "Mistral7BModel", "MixtralConfig", "MixtralModel", + "LlamaConfig", + "Llama2Config7B", + "Llama2Config13B", + "Llama2Config70B", + "CodeLlamaConfig7B", + "CodeLlamaConfig13B", + "CodeLlamaConfig34B", + "CodeLlamaConfig70B", + "LlamaModel", "PreTrainingDataModule", "FineTuningDataModule", "SquadDataModule", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 0ddaa61c7a35..b1726e89f2a6 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -7,6 +7,8 @@ ) from nemo.collections.llm.gpt.model.mistral_7b import Mistral7BConfig, Mistral7BModel from nemo.collections.llm.gpt.model.mixtral import MixtralConfig, MixtralModel +from nemo.collections.llm.gpt.model.llama import * + __all__ = [ "GPTConfig", @@ -15,7 +17,17 @@ "Mistral7BModel", "MixtralConfig", "MixtralModel", + "LlamaConfig", + "Llama2Config7B", + "Llama2Config13B", + "Llama2Config70B", + "CodeLlamaConfig7B", + "CodeLlamaConfig13B", + "CodeLlamaConfig34B", + "CodeLlamaConfig70B", + "LlamaModel", "MaskedTokenLossReduction", "gpt_data_step", "gpt_forward_step", ] + diff --git a/nemo/collections/llm/gpt/model/mistral_7b.py b/nemo/collections/llm/gpt/model/mistral_7b.py index ada67c17da25..ff9591581f86 100644 --- a/nemo/collections/llm/gpt/model/mistral_7b.py +++ b/nemo/collections/llm/gpt/model/mistral_7b.py @@ -71,9 +71,6 @@ def apply(self, output_path: Path) -> Path: return output_path - def on_import_ckpt(self, model: pl.LightningModule): - model.tokenizer = self.tokenizer - def convert_state(self, source, target): mapping = { "model.embed_tokens.weight": "embedding.word_embeddings.weight", diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index a6ab4afd6d1b..b813b6198477 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -217,4 +217,5 @@ def local_path(self, base_path: Optional[Path] = None) -> Path: return _base / str(self).replace("://", "/") - def on_import_ckpt(self, model: pl.LightningModule): ... 
+ def on_import_ckpt(self, model: pl.LightningModule): + model.tokenizer = self.tokenizer \ No newline at end of file From 2b5bce408367a9a1af96079edf7ef7c9ab6276a4 Mon Sep 17 00:00:00 2001 From: cuichenx Date: Mon, 24 Jun 2024 21:12:59 +0000 Subject: [PATCH 02/13] Apply isort and black reformatting Signed-off-by: cuichenx --- nemo/collections/llm/__init__.py | 18 +++++++++--------- nemo/collections/llm/gpt/model/__init__.py | 4 +--- nemo/lightning/io/connector.py | 2 +- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 2dcb778c5aef..ff34a10502f0 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -13,22 +13,22 @@ SquadDataModule, ) from nemo.collections.llm.gpt.model import ( + CodeLlamaConfig7B, + CodeLlamaConfig13B, + CodeLlamaConfig34B, + CodeLlamaConfig70B, GPTConfig, GPTModel, + Llama2Config7B, + Llama2Config13B, + Llama2Config70B, + LlamaConfig, + LlamaModel, MaskedTokenLossReduction, Mistral7BConfig, Mistral7BModel, MixtralConfig, MixtralModel, - LlamaConfig, - Llama2Config7B, - Llama2Config13B, - Llama2Config70B, - CodeLlamaConfig7B, - CodeLlamaConfig13B, - CodeLlamaConfig34B, - CodeLlamaConfig70B, - LlamaModel, gpt_data_step, gpt_forward_step, ) diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index b1726e89f2a6..6a2f075cbd4f 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -5,10 +5,9 @@ gpt_data_step, gpt_forward_step, ) +from nemo.collections.llm.gpt.model.llama import * from nemo.collections.llm.gpt.model.mistral_7b import Mistral7BConfig, Mistral7BModel from nemo.collections.llm.gpt.model.mixtral import MixtralConfig, MixtralModel -from nemo.collections.llm.gpt.model.llama import * - __all__ = [ "GPTConfig", @@ -30,4 +29,3 @@ "gpt_data_step", "gpt_forward_step", ] - diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index b813b6198477..41c81582bb63 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -218,4 +218,4 @@ def local_path(self, base_path: Optional[Path] = None) -> Path: return _base / str(self).replace("://", "/") def on_import_ckpt(self, model: pl.LightningModule): - model.tokenizer = self.tokenizer \ No newline at end of file + model.tokenizer = self.tokenizer From c56f3bb9d84fe857139fa801ada3797bd433b0f6 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 24 Jun 2024 14:18:35 -0700 Subject: [PATCH 03/13] add llama Signed-off-by: Chen Cui --- nemo/collections/llm/gpt/model/llama.py | 351 ++++++++++++++++++++++++ 1 file changed, 351 insertions(+) create mode 100644 nemo/collections/llm/gpt/model/llama.py diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py new file mode 100644 index 000000000000..218c318d1290 --- /dev/null +++ b/nemo/collections/llm/gpt/model/llama.py @@ -0,0 +1,351 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Callable, Type, Optional, Annotated + +import torch +import torch.nn.functional as F + +from nemo.lightning import io, OptimizerModule, teardown +from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.collections.llm.utils import Config + +if TYPE_CHECKING: + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + from transformers import ( + LlamaConfig as HFLlamaConfig, + LlamaForCausalLM, + ) + from 
nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +# Note: these Llama configs are copied from the corresponding HF model. You may need to modify the parameter for +# your own needs, in particular: seq_length and rotary_base. +@dataclass +class LlamaConfig(GPTConfig): + # configs that are common across model sizes + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + gated_linear_unit: bool = True + position_embedding_type: str = "rope" + add_bias_linear: bool = False + seq_length: int = 4096 + + +@dataclass +class Llama2Config7B(LlamaConfig): + num_layers: int = 32 + hidden_size: int = 4096 + num_attention_heads: int = 32 + num_query_groups: int = 32 + fnn_hidden_size: int = 11008 + + +@dataclass +class Llama2Config13B(LlamaConfig): + num_layers: int = 40 + hidden_size: int = 5120 + num_attention_heads: int = 40 + num_query_groups: int = 40 + ffn_hidden_size: int = 13824 + + +@dataclass +class Llama2Config70B(LlamaConfig): + num_layers: int = 80 + hidden_size: int = 8192 + num_attention_heads: int = 64 + num_query_groups: int = 8 + ffn_hidden_size: int = 28672 + +@dataclass +class CodeLlamaConfig7B(Llama2Config7B): + rotary_base: int = 1_000_000 + seq_length: int = 16384 + + +@dataclass +class CodeLlamaConfig13B(Llama2Config13B): + rotary_base: int = 1_000_000 + seq_length: int = 16384 + + +@dataclass +class CodeLlamaConfig34B(LlamaConfig): + num_layers: int = 48 + hidden_size: int = 8192 + num_attention_heads: int = 64 + num_query_groups: int = 8 + ffn_hidden_size: int = 22016 + rotary_base: int = 1_000_000 + seq_length: int = 16384 + + +@dataclass +class CodeLlamaConfig70B(Llama2Config70B): + pass + + +class LlamaModel(GPTModel): + def __init__( + self, + config: Annotated[Optional[LlamaConfig], Config[LlamaConfig]] = None, + optim: Optional[OptimizerModule] = None, + tokenizer: Optional["TokenizerSpec"] = None, + ): + super().__init__(config or LlamaConfig(), optim=optim, tokenizer=tokenizer) + + +@io.model_importer(LlamaModel, "hf") +class HFLlamaImporter(io.ModelConnector["LlamaForCausalLM", LlamaModel]): + def init(self) -> LlamaModel: + return LlamaModel(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + from transformers import LlamaForCausalLM + + source = LlamaForCausalLM.from_pretrained(str(self)) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + print(f"Converted Llama model to Nemo, model saved to {output_path}") + + teardown(trainer, target) + del trainer, target + + return output_path + def convert_state(self, source, target): + mapping = { + "model.embed_tokens.weight": "embedding.word_embeddings.weight", + "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", + "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "model.norm.weight": "decoder.final_layernorm.weight", + "lm_head.weight": "output_layer.weight" + } + + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=[_import_qkv, _import_linear_fc1] + ) + + @property + def tokenizer(self) -> "AutoTokenizer": + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + return AutoTokenizer(str(self)) + + @property + 
def config(self) -> LlamaConfig: + from transformers import LlamaConfig as HFLlamaConfig + + source = HFLlamaConfig.from_pretrained(str(self)) + + def make_vocab_size_divisible_by(vocab_size): + base = 128 + while vocab_size % base != 0: + base //= 2 + return base + + output = HFLlamaConfig( + seq_length=source.sliding_window, + num_layers=source.num_hidden_layers, + hidden_size=source.hidden_size, + ffn_hidden_size=source.intermediate_size, + num_attention_heads=source.num_attention_heads, + init_method_std=source.initializer_range, + layernorm_epsilon=source.rms_norm_eps, + num_query_groups=source.num_key_value_heads, + rotary_base=source.rope_theta, + gated_linear_unit=True, + make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), + window_size=[source.sliding_window, 0], + share_embeddings_and_output_weights=False, + ) + + return output + + +@io.model_exporter(LlamaModel, "hf") +class HFLlamaExporter(io.ModelConnector[LlamaModel, "LlamaForCausalLM"]): + def init(self) -> "LlamaForCausalLM": + from transformers import AutoModelForCausalLM + + return AutoModelForCausalLM.from_config(self.config) + + def apply(self, output_path: Path) -> Path: + target = self.init() + source, _ = self.nemo_load(str(self)) + target = self.convert_state(source, target) + + target = target.cpu() + target.save_pretrained(output_path) + self.tokenizer.save_pretrained(output_path) + + return output_path + + def convert_state(self, source, target): + mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", + "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", + "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight", + "decoder.final_layernorm.weight": "model.norm.weight", + "output_layer.weight": "lm_head.weight" + } + + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=[_export_qkv, _export_linear_fc1] + ) + + @property + def tokenizer(self): + return io.load_ckpt(str(self)).model.tokenizer.tokenizer + + @property + def config(self) -> "HFLlamaConfig": + source: LlamaConfig = io.load_ckpt(str(self)).model.config + + from transformers import LlamaConfig as HFLlamaConfig + + return HFLlamaConfig( + sliding_window=source.window_size[0], + num_hidden_layers=source.num_layers, + hidden_size=source.hidden_size, + intermediate_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + max_position_embeddings=source.seq_length, + initializer_range=source.init_method_std, + rms_norm_eps=source.layernorm_epsilon, + num_key_value_heads=source.num_query_groups, + rope_theta=source.rotary_base, + vocab_size=self.tokenizer.vocab_size, + ) + + +@io.state_transform( + source_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), + target_key="decoder.layers.*.self_attention.linear_qkv.weight" +) +def _import_qkv(ctx: io.TransformCTX, q, k, v): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + 
old_tensor_shape = q.size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights_l = [] + for i in range(num_query_groups): + qkv_weights_l.append( + q[i * heads_per_group: (i + 1) * heads_per_group, :, :] + ) + qkv_weights_l.append(k[i: i + 1, :, :]) + qkv_weights_l.append(v[i: i + 1, :, :]) + qkv_weights = torch.cat(qkv_weights_l) + assert qkv_weights.ndim == 3, qkv_weights.shape + assert ( + qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups + ), qkv_weights.shape + assert qkv_weights.shape[1] == head_size, qkv_weights.shape + assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape + + qkv_weights = qkv_weights.reshape( + [head_size * (head_num + 2 * num_query_groups), hidden_size] + ) + + return qkv_weights + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), +) +def _export_qkv(ctx: io.TransformCTX, linear_qkv): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size]) + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu() + k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu() + v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu() + + return q_proj, k_proj, v_proj + + +@io.state_transform( + source_key=( + "model.layers.*.mlp.gate_proj.weight", + "model.layers.*.mlp.up_proj.weight" + ), + target_key="decoder.layers.*.mlp.linear_fc1.weight" +) +def _import_linear_fc1(down, gate): + return torch.cat((down, gate), axis=0).float() + + +@io.state_transform( + source_key="decoder.layers.*.mlp.linear_fc1.weight", + target_key=( + "model.layers.*.mlp.gate_proj.weight", + "model.layers.*.mlp.up_proj.weight" + ), +) +def _export_linear_fc1(linear_fc1): + gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) + + return gate_proj, up_proj + + +__all__ = [ + "LlamaConfig", + "Llama2Config7B", + "Llama2Config13B", + "Llama2Config70B", + "CodeLlamaConfig7B", + "CodeLlamaConfig13B", + "CodeLlamaConfig34B", + "CodeLlamaConfig70B", + "LlamaModel", +] From a74fa0787593b737b509d4cc35eb707e18da9c61 Mon Sep 17 00:00:00 2001 From: cuichenx Date: Mon, 24 Jun 2024 21:19:15 +0000 Subject: [PATCH 04/13] Apply isort and black reformatting Signed-off-by: cuichenx --- nemo/collections/llm/gpt/model/llama.py | 74 +++++++++---------------- 1 file changed, 27 insertions(+), 47 deletions(-) diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 218c318d1290..16158bbc96a4 100644 --- a/nemo/collections/llm/gpt/model/llama.py 
+++ b/nemo/collections/llm/gpt/model/llama.py @@ -1,20 +1,19 @@ from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Callable, Type, Optional, Annotated +from typing import TYPE_CHECKING, Annotated, Callable, Optional, Type import torch import torch.nn.functional as F -from nemo.lightning import io, OptimizerModule, teardown from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import Config +from nemo.lightning import OptimizerModule, io, teardown if TYPE_CHECKING: + from transformers import LlamaConfig as HFLlamaConfig + from transformers import LlamaForCausalLM + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - from transformers import ( - LlamaConfig as HFLlamaConfig, - LlamaForCausalLM, - ) from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec @@ -57,6 +56,7 @@ class Llama2Config70B(LlamaConfig): num_query_groups: int = 8 ffn_hidden_size: int = 28672 + @dataclass class CodeLlamaConfig7B(Llama2Config7B): rotary_base: int = 1_000_000 @@ -115,6 +115,7 @@ def apply(self, output_path: Path) -> Path: del trainer, target return output_path + def convert_state(self, source, target): mapping = { "model.embed_tokens.weight": "embedding.word_embeddings.weight", @@ -123,19 +124,15 @@ def convert_state(self, source, target): "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", "model.norm.weight": "decoder.final_layernorm.weight", - "lm_head.weight": "output_layer.weight" + "lm_head.weight": "output_layer.weight", } - return io.apply_transforms( - source, - target, - mapping=mapping, - transforms=[_import_qkv, _import_linear_fc1] - ) + return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1]) @property def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + return AutoTokenizer(str(self)) @property @@ -195,15 +192,10 @@ def convert_state(self, source, target): "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight", "decoder.final_layernorm.weight": "model.norm.weight", - "output_layer.weight": "lm_head.weight" + "output_layer.weight": "lm_head.weight", } - return io.apply_transforms( - source, - target, - mapping=mapping, - transforms=[_export_qkv, _export_linear_fc1] - ) + return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_linear_fc1]) @property def tokenizer(self): @@ -232,11 +224,11 @@ def config(self) -> "HFLlamaConfig": @io.state_transform( source_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", ), - target_key="decoder.layers.*.self_attention.linear_qkv.weight" + target_key="decoder.layers.*.self_attention.linear_qkv.weight", ) def _import_qkv(ctx: io.TransformCTX, q, k, v): megatron_config = ctx.target.config @@ -258,22 +250,16 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): qkv_weights_l = [] for i in range(num_query_groups): - qkv_weights_l.append( - q[i * 
heads_per_group: (i + 1) * heads_per_group, :, :] - ) - qkv_weights_l.append(k[i: i + 1, :, :]) - qkv_weights_l.append(v[i: i + 1, :, :]) + qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) qkv_weights = torch.cat(qkv_weights_l) assert qkv_weights.ndim == 3, qkv_weights.shape - assert ( - qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups - ), qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape assert qkv_weights.shape[1] == head_size, qkv_weights.shape assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape - qkv_weights = qkv_weights.reshape( - [head_size * (head_num + 2 * num_query_groups), hidden_size] - ) + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) return qkv_weights @@ -281,9 +267,9 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): @io.state_transform( source_key="decoder.layers.*.self_attention.linear_qkv.weight", target_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", ), ) def _export_qkv(ctx: io.TransformCTX, linear_qkv): @@ -315,11 +301,8 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv): @io.state_transform( - source_key=( - "model.layers.*.mlp.gate_proj.weight", - "model.layers.*.mlp.up_proj.weight" - ), - target_key="decoder.layers.*.mlp.linear_fc1.weight" + source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), + target_key="decoder.layers.*.mlp.linear_fc1.weight", ) def _import_linear_fc1(down, gate): return torch.cat((down, gate), axis=0).float() @@ -327,10 +310,7 @@ def _import_linear_fc1(down, gate): @io.state_transform( source_key="decoder.layers.*.mlp.linear_fc1.weight", - target_key=( - "model.layers.*.mlp.gate_proj.weight", - "model.layers.*.mlp.up_proj.weight" - ), + target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), ) def _export_linear_fc1(linear_fc1): gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) From 5acc3c69e0fd0355b752456c29c70fc313f91563 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 24 Jun 2024 14:28:19 -0700 Subject: [PATCH 05/13] add llama3 Signed-off-by: Chen Cui --- nemo/collections/llm/__init__.py | 4 ++++ nemo/collections/llm/gpt/model/__init__.py | 2 ++ nemo/collections/llm/gpt/model/llama.py | 13 +++++++++++++ 3 files changed, 19 insertions(+) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index ff34a10502f0..0f0807950480 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -22,6 +22,8 @@ Llama2Config7B, Llama2Config13B, Llama2Config70B, + Llama3Config8B, + Llama3Config70B, LlamaConfig, LlamaModel, MaskedTokenLossReduction, @@ -48,6 +50,8 @@ "Llama2Config7B", "Llama2Config13B", "Llama2Config70B", + "Llama3Config8B", + "Llama3Config70B", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 6a2f075cbd4f..671fb20c5263 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -20,6 +20,8 @@ "Llama2Config7B", "Llama2Config13B", "Llama2Config70B", + "Llama3Config8B", + 
"Llama3Config70B", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 218c318d1290..fb69c69dc4a3 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -57,6 +57,17 @@ class Llama2Config70B(LlamaConfig): num_query_groups: int = 8 ffn_hidden_size: int = 28672 +@dataclass +class Llama3Config8B(Llama2Config7B): + seq_length: int = 8192 + num_query_groups: int = 8 + fnn_hidden_size: int = 14336 + +@dataclass +class Llama3Config70B(Llama2Config70B): + seq_length: int = 8192 + + @dataclass class CodeLlamaConfig7B(Llama2Config7B): rotary_base: int = 1_000_000 @@ -343,6 +354,8 @@ def _export_linear_fc1(linear_fc1): "Llama2Config7B", "Llama2Config13B", "Llama2Config70B", + "Llama3Config8B", + "Llama3Config70B", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", From 7b4fe07657e6c1a364d195c7dc3d9941cb542cb7 Mon Sep 17 00:00:00 2001 From: cuichenx Date: Mon, 24 Jun 2024 21:29:11 +0000 Subject: [PATCH 06/13] Apply isort and black reformatting Signed-off-by: cuichenx --- nemo/collections/llm/gpt/model/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 824e70fc6468..251c9e36dbc0 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -63,6 +63,7 @@ class Llama3Config8B(Llama2Config7B): num_query_groups: int = 8 fnn_hidden_size: int = 14336 + @dataclass class Llama3Config70B(Llama2Config70B): seq_length: int = 8192 From 5118b63011a3b8ebeacca9b5c5a8e0ec4bd03ccd Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 24 Jun 2024 14:52:47 -0700 Subject: [PATCH 07/13] fix typo Signed-off-by: Chen Cui --- nemo/collections/llm/gpt/model/llama.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 824e70fc6468..4211c57fd519 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -36,7 +36,7 @@ class Llama2Config7B(LlamaConfig): hidden_size: int = 4096 num_attention_heads: int = 32 num_query_groups: int = 32 - fnn_hidden_size: int = 11008 + ffn_hidden_size: int = 11008 @dataclass @@ -61,7 +61,7 @@ class Llama2Config70B(LlamaConfig): class Llama3Config8B(Llama2Config7B): seq_length: int = 8192 num_query_groups: int = 8 - fnn_hidden_size: int = 14336 + ffn_hidden_size: int = 14336 @dataclass class Llama3Config70B(Llama2Config70B): @@ -158,8 +158,7 @@ def make_vocab_size_divisible_by(vocab_size): base //= 2 return base - output = HFLlamaConfig( - seq_length=source.sliding_window, + output = LlamaConfig( num_layers=source.num_hidden_layers, hidden_size=source.hidden_size, ffn_hidden_size=source.intermediate_size, @@ -170,7 +169,6 @@ def make_vocab_size_divisible_by(vocab_size): rotary_base=source.rope_theta, gated_linear_unit=True, make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), - window_size=[source.sliding_window, 0], share_embeddings_and_output_weights=False, ) @@ -219,7 +217,6 @@ def config(self) -> "HFLlamaConfig": from transformers import LlamaConfig as HFLlamaConfig return HFLlamaConfig( - sliding_window=source.window_size[0], num_hidden_layers=source.num_layers, hidden_size=source.hidden_size, intermediate_size=source.ffn_hidden_size, From 52321f41673aa350d9696924657485a24ae3b335 Mon Sep 17 00:00:00 2001 From: Chen 
Cui Date: Mon, 24 Jun 2024 14:53:05 -0700 Subject: [PATCH 08/13] enable importers with multiple models Signed-off-by: Chen Cui --- nemo/lightning/io/mixin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index 62b9a165c542..b286341cc1cd 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -198,7 +198,7 @@ def register_importer(cls, ext: str, default_path: Optional[str] = None) -> Call """ def decorator(connector: Type[ConnT]) -> Type[ConnT]: - cls._IMPORTERS[ext] = connector + cls._IMPORTERS[str(cls)+ext] = connector if default_path: connector.default_path = default_path return connector @@ -221,7 +221,7 @@ def register_exporter(cls, ext: str, default_path: Optional[str] = None) -> Call """ def decorator(connector: Type[ConnT]) -> Type[ConnT]: - cls._EXPORTERS[ext] = connector + cls._EXPORTERS[str(cls)+ext] = connector if default_path: connector.default_path = default_path return connector @@ -310,7 +310,7 @@ def _get_connector(cls, ext, path=None, importer=True) -> ModelConnector: else: _path = path - connector = cls._IMPORTERS.get(ext) if importer else cls._EXPORTERS.get(ext) + connector = cls._IMPORTERS.get(str(cls)+ext) if importer else cls._EXPORTERS.get(str(cls)+ext) if not connector: raise ValueError(f"No connector found for extension '{ext}'") From 1765002bcff660a8804610891d2f6fb5e593a09e Mon Sep 17 00:00:00 2001 From: cuichenx Date: Mon, 24 Jun 2024 21:54:19 +0000 Subject: [PATCH 09/13] Apply isort and black reformatting Signed-off-by: cuichenx --- nemo/lightning/io/mixin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index b286341cc1cd..54b6e7195bc9 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -198,7 +198,7 @@ def register_importer(cls, ext: str, default_path: Optional[str] = None) -> Call """ def decorator(connector: Type[ConnT]) -> Type[ConnT]: - cls._IMPORTERS[str(cls)+ext] = connector + cls._IMPORTERS[str(cls) + ext] = connector if default_path: connector.default_path = default_path return connector @@ -221,7 +221,7 @@ def register_exporter(cls, ext: str, default_path: Optional[str] = None) -> Call """ def decorator(connector: Type[ConnT]) -> Type[ConnT]: - cls._EXPORTERS[str(cls)+ext] = connector + cls._EXPORTERS[str(cls) + ext] = connector if default_path: connector.default_path = default_path return connector @@ -310,7 +310,7 @@ def _get_connector(cls, ext, path=None, importer=True) -> ModelConnector: else: _path = path - connector = cls._IMPORTERS.get(str(cls)+ext) if importer else cls._EXPORTERS.get(str(cls)+ext) + connector = cls._IMPORTERS.get(str(cls) + ext) if importer else cls._EXPORTERS.get(str(cls) + ext) if not connector: raise ValueError(f"No connector found for extension '{ext}'") From 9051f4d834b4f0b5c398953e52ac5b79fe0c81c0 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 24 Jun 2024 15:42:13 -0700 Subject: [PATCH 10/13] add gemma Signed-off-by: Chen Cui --- nemo/collections/llm/__init__.py | 12 + nemo/collections/llm/gpt/model/__init__.py | 7 + nemo/collections/llm/gpt/model/gemma.py | 322 +++++++++++++++++++++ 3 files changed, 341 insertions(+) create mode 100644 nemo/collections/llm/gpt/model/gemma.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 0f0807950480..2fbac2304569 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -26,6 +26,12 @@ Llama3Config70B, 
LlamaConfig, LlamaModel, + GemmaConfig, + GemmaConfig2B, + GemmaConfig7B, + CodeGemmaConfig2B, + CodeGemmaConfig7B, + GemmaModel, MaskedTokenLossReduction, Mistral7BConfig, Mistral7BModel, @@ -57,6 +63,12 @@ "CodeLlamaConfig34B", "CodeLlamaConfig70B", "LlamaModel", + "GemmaConfig", + "GemmaConfig2B", + "GemmaConfig7B", + "CodeGemmaConfig2B", + "CodeGemmaConfig7B", + "GemmaModel", "PreTrainingDataModule", "FineTuningDataModule", "SquadDataModule", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 671fb20c5263..334ee05bd5df 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -6,6 +6,7 @@ gpt_forward_step, ) from nemo.collections.llm.gpt.model.llama import * +from nemo.collections.llm.gpt.model.gemma import * from nemo.collections.llm.gpt.model.mistral_7b import Mistral7BConfig, Mistral7BModel from nemo.collections.llm.gpt.model.mixtral import MixtralConfig, MixtralModel @@ -26,6 +27,12 @@ "CodeLlamaConfig13B", "CodeLlamaConfig34B", "CodeLlamaConfig70B", + "GemmaConfig", + "GemmaConfig2B", + "GemmaConfig7B", + "CodeGemmaConfig2B", + "CodeGemmaConfig7B", + "GemmaModel", "LlamaModel", "MaskedTokenLossReduction", "gpt_data_step", diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py new file mode 100644 index 000000000000..8e0c4f54278f --- /dev/null +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -0,0 +1,322 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Callable, Generic, Optional, Type, TypeVar, Annotated + +import torch +from nemo.collections.nlp.modules.common.megatron.utils import openai_gelu + +from nemo.lightning import io, OptimizerModule, teardown +from nemo.lightning.io.connector import TargetT +from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.collections.llm.utils import Config + + +if TYPE_CHECKING: + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + from transformers import ( + GemmaForCausalLM, + ) + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +# Note: Gemma requires huggingface transformers >= 4.38 +# Note: these Gemma configs are copied from the corresponding HF model. You may need to modify the parameter for +# your own needs, in particular: seq_length and rotary_base. 
+@dataclass +class GemmaConfig(GPTConfig): + # configs that are common across model sizes + normalization: str = "RMSNorm" + activation_func: Callable = openai_gelu + gated_linear_unit: bool = True + position_embedding_type: str = "rope" + add_bias_linear: bool = False + seq_length: int = 8192 + kv_channels: int = 256 + share_embeddings_and_output_weights: bool = True + # Note: different behavior compared to Legacy NeMo + # Legacy NeMo does not set layernorm_zero_centered_gamma and instead adds 1 in the HF -> NeMo conversion script + # The present implementation is more in line with the official implementation + layernorm_zero_centered_gamma: bool = True + + +@dataclass +class GemmaConfig2B(GemmaConfig): + num_layers: int = 18 + hidden_size: int = 2048 + num_attention_heads: int = 8 + num_query_groups: int = 1 + ffn_hidden_size: int = 16384 + + +@dataclass +class GemmaConfig7B(GemmaConfig): + num_layers: int = 28 + hidden_size: int = 3072 + num_attention_heads: int = 16 + num_query_groups: int = 16 + ffn_hidden_size: int = 24576 + + +class CodeGemmaConfig2B(GemmaConfig2B): + pass + + +class CodeGemmaConfig7B(GemmaConfig7B): + pass + + +class GemmaModel(GPTModel): + def __init__( + self, + config: Annotated[Optional[GemmaConfig], Config[GemmaConfig]] = None, + optim: Optional[OptimizerModule] = None, + tokenizer: Optional["TokenizerSpec"] = None, + ): + super().__init__(config or GemmaConfig(), optim=optim, tokenizer=tokenizer) + +@io.model_importer(GemmaModel, "hf") +class HFGemmaImporter(io.ModelConnector["GemmaForCausalLM", GemmaModel]): + def init(self) -> GemmaModel: + return GemmaModel(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + from transformers import GemmaForCausalLM + + source = GemmaForCausalLM.from_pretrained(str(self)) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + print(f"Converted Gemma model to Nemo, model saved to {output_path}") + + teardown(trainer, target) + del trainer, target + + return output_path + + def convert_state(self, source, target): + mapping = { + "model.embed_tokens.weight": "embedding.word_embeddings.weight", + "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", + "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "model.norm.weight": "decoder.final_layernorm.weight", + } + + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=[_import_qkv, _import_linear_fc1] + ) + + @property + def tokenizer(self) -> "AutoTokenizer": + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + + return AutoTokenizer(str(self)) + + @property + def config(self) -> GemmaConfig: + from transformers import GemmaConfig as HFGemmaConfig + source = HFGemmaConfig.from_pretrained(str(self)) + + def make_vocab_size_divisible_by(vocab_size): + base = 128 + while vocab_size % base != 0: + base //= 2 + return base + + output = GemmaConfig( + num_layers=source.num_hidden_layers, + hidden_size=source.hidden_size, + ffn_hidden_size=source.intermediate_size, + num_attention_heads=source.num_attention_heads, + init_method_std=source.initializer_range, + layernorm_epsilon=source.rms_norm_eps, + 
num_query_groups=source.num_key_value_heads, + rotary_base=source.rope_theta, + gated_linear_unit=True, + make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), + share_embeddings_and_output_weights=False, + ) + + return output + + +@io.model_exporter(GemmaModel, "hf") +class HFGemmaExporter(io.ModelConnector[GemmaModel, "GemmaForCausalLM"]): + def init(self) -> "GemmaForCausalLM": + from transformers import AutoModelForCausalLM + + return AutoModelForCausalLM.from_config(self.config) + + def apply(self, output_path: Path) -> Path: + target = self.init() + source, _ = self.nemo_load(str(self)) + target = self.convert_state(source, target) + + target = target.cpu() + target.save_pretrained(output_path) + self.tokenizer.save_pretrained(output_path) + + return output_path + + def convert_state(self, source, target): + mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", + "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", + "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight", + "decoder.final_layernorm.weight": "model.norm.weight", + } + + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=[_export_qkv, _export_linear_fc1] + ) + + @property + def tokenizer(self): + return io.load_ckpt(str(self)).model.tokenizer.tokenizer + + @property + def config(self) -> "GemmaConfig": + source: GemmaConfig = io.load_ckpt(str(self)).model.config + + from transformers import GemmaConfig as HFGemmaConfig + + return HFGemmaConfig( + num_hidden_layers=source.num_layers, + hidden_size=source.hidden_size, + intermediate_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + max_position_embeddings=source.seq_length, + initializer_range=source.init_method_std, + rms_norm_eps=source.layernorm_epsilon, + num_key_value_heads=source.num_query_groups, + vocab_size=self.tokenizer.vocab_size + ) + + +@io.state_transform( + source_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), + target_key="decoder.layers.*.self_attention.linear_qkv.weight" +) +def _import_qkv(ctx: io.TransformCTX, q, k, v): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + old_tensor_shape = q.size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights_l = [] + for i in range(num_query_groups): + qkv_weights_l.append( + q[i * heads_per_group: (i + 1) * heads_per_group, :, :] + ) + qkv_weights_l.append(k[i: i + 1, :, :]) + qkv_weights_l.append(v[i: i + 1, :, :]) + qkv_weights = torch.cat(qkv_weights_l) + assert qkv_weights.ndim == 3, qkv_weights.shape + assert ( + qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups + ), qkv_weights.shape + assert qkv_weights.shape[1] == head_size, 
qkv_weights.shape + assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape + + qkv_weights = qkv_weights.reshape( + [head_size * (head_num + 2 * num_query_groups), hidden_size] + ) + + return qkv_weights + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), +) +def _export_qkv(ctx: io.TransformCTX, linear_qkv): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size]) + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu() + k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu() + v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu() + + return q_proj, k_proj, v_proj + + +@io.state_transform( + source_key=( + "model.layers.*.mlp.gate_proj.weight", + "model.layers.*.mlp.up_proj.weight" + ), + target_key="decoder.layers.*.mlp.linear_fc1.weight" +) +def _import_linear_fc1(down, gate): + return torch.cat((down, gate), axis=0).float() + + +@io.state_transform( + source_key="decoder.layers.*.mlp.linear_fc1.weight", + target_key=( + "model.layers.*.mlp.gate_proj.weight", + "model.layers.*.mlp.up_proj.weight" + ), +) +def _export_linear_fc1(linear_fc1): + gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) + + return gate_proj, up_proj + + +__all__ = [ + "GemmaConfig", + "GemmaConfig2B", + "GemmaConfig7B", + "CodeGemmaConfig2B", + "CodeGemmaConfig7B", + "GemmaModel", +] \ No newline at end of file From b8d13c27cec172b38e06991c0e82680e47535466 Mon Sep 17 00:00:00 2001 From: cuichenx Date: Mon, 24 Jun 2024 22:42:54 +0000 Subject: [PATCH 11/13] Apply isort and black reformatting Signed-off-by: cuichenx --- nemo/collections/llm/__init__.py | 12 ++-- nemo/collections/llm/gpt/model/__init__.py | 2 +- nemo/collections/llm/gpt/model/gemma.py | 76 ++++++++-------------- 3 files changed, 34 insertions(+), 56 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 2fbac2304569..19911b544f43 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -13,10 +13,16 @@ SquadDataModule, ) from nemo.collections.llm.gpt.model import ( + CodeGemmaConfig2B, + CodeGemmaConfig7B, CodeLlamaConfig7B, CodeLlamaConfig13B, CodeLlamaConfig34B, CodeLlamaConfig70B, + GemmaConfig, + GemmaConfig2B, + GemmaConfig7B, + GemmaModel, GPTConfig, GPTModel, Llama2Config7B, @@ -26,12 +32,6 @@ Llama3Config70B, LlamaConfig, LlamaModel, - GemmaConfig, - GemmaConfig2B, - GemmaConfig7B, - CodeGemmaConfig2B, - CodeGemmaConfig7B, - GemmaModel, MaskedTokenLossReduction, Mistral7BConfig, Mistral7BModel, diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 334ee05bd5df..2da72539fd15 100644 --- 
a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -5,8 +5,8 @@ gpt_data_step, gpt_forward_step, ) -from nemo.collections.llm.gpt.model.llama import * from nemo.collections.llm.gpt.model.gemma import * +from nemo.collections.llm.gpt.model.llama import * from nemo.collections.llm.gpt.model.mistral_7b import Mistral7BConfig, Mistral7BModel from nemo.collections.llm.gpt.model.mixtral import MixtralConfig, MixtralModel diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index 8e0c4f54278f..f933676c0c63 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -1,21 +1,19 @@ from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Callable, Generic, Optional, Type, TypeVar, Annotated +from typing import TYPE_CHECKING, Annotated, Callable, Generic, Optional, Type, TypeVar import torch -from nemo.collections.nlp.modules.common.megatron.utils import openai_gelu -from nemo.lightning import io, OptimizerModule, teardown -from nemo.lightning.io.connector import TargetT from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import Config - +from nemo.collections.nlp.modules.common.megatron.utils import openai_gelu +from nemo.lightning import OptimizerModule, io, teardown +from nemo.lightning.io.connector import TargetT if TYPE_CHECKING: + from transformers import GemmaForCausalLM + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - from transformers import ( - GemmaForCausalLM, - ) from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec @@ -74,6 +72,7 @@ def __init__( ): super().__init__(config or GemmaConfig(), optim=optim, tokenizer=tokenizer) + @io.model_importer(GemmaModel, "hf") class HFGemmaImporter(io.ModelConnector["GemmaForCausalLM", GemmaModel]): def init(self) -> GemmaModel: @@ -105,12 +104,7 @@ def convert_state(self, source, target): "model.norm.weight": "decoder.final_layernorm.weight", } - return io.apply_transforms( - source, - target, - mapping=mapping, - transforms=[_import_qkv, _import_linear_fc1] - ) + return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1]) @property def tokenizer(self) -> "AutoTokenizer": @@ -121,6 +115,7 @@ def tokenizer(self) -> "AutoTokenizer": @property def config(self) -> GemmaConfig: from transformers import GemmaConfig as HFGemmaConfig + source = HFGemmaConfig.from_pretrained(str(self)) def make_vocab_size_divisible_by(vocab_size): @@ -174,12 +169,7 @@ def convert_state(self, source, target): "decoder.final_layernorm.weight": "model.norm.weight", } - return io.apply_transforms( - source, - target, - mapping=mapping, - transforms=[_export_qkv, _export_linear_fc1] - ) + return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_linear_fc1]) @property def tokenizer(self): @@ -200,17 +190,17 @@ def config(self) -> "GemmaConfig": initializer_range=source.init_method_std, rms_norm_eps=source.layernorm_epsilon, num_key_value_heads=source.num_query_groups, - vocab_size=self.tokenizer.vocab_size + vocab_size=self.tokenizer.vocab_size, ) @io.state_transform( source_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + 
"model.layers.*.self_attn.v_proj.weight", ), - target_key="decoder.layers.*.self_attention.linear_qkv.weight" + target_key="decoder.layers.*.self_attention.linear_qkv.weight", ) def _import_qkv(ctx: io.TransformCTX, q, k, v): megatron_config = ctx.target.config @@ -232,22 +222,16 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): qkv_weights_l = [] for i in range(num_query_groups): - qkv_weights_l.append( - q[i * heads_per_group: (i + 1) * heads_per_group, :, :] - ) - qkv_weights_l.append(k[i: i + 1, :, :]) - qkv_weights_l.append(v[i: i + 1, :, :]) + qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) qkv_weights = torch.cat(qkv_weights_l) assert qkv_weights.ndim == 3, qkv_weights.shape - assert ( - qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups - ), qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape assert qkv_weights.shape[1] == head_size, qkv_weights.shape assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape - qkv_weights = qkv_weights.reshape( - [head_size * (head_num + 2 * num_query_groups), hidden_size] - ) + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) return qkv_weights @@ -255,9 +239,9 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): @io.state_transform( source_key="decoder.layers.*.self_attention.linear_qkv.weight", target_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", ), ) def _export_qkv(ctx: io.TransformCTX, linear_qkv): @@ -289,11 +273,8 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv): @io.state_transform( - source_key=( - "model.layers.*.mlp.gate_proj.weight", - "model.layers.*.mlp.up_proj.weight" - ), - target_key="decoder.layers.*.mlp.linear_fc1.weight" + source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), + target_key="decoder.layers.*.mlp.linear_fc1.weight", ) def _import_linear_fc1(down, gate): return torch.cat((down, gate), axis=0).float() @@ -301,10 +282,7 @@ def _import_linear_fc1(down, gate): @io.state_transform( source_key="decoder.layers.*.mlp.linear_fc1.weight", - target_key=( - "model.layers.*.mlp.gate_proj.weight", - "model.layers.*.mlp.up_proj.weight" - ), + target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), ) def _export_linear_fc1(linear_fc1): gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) @@ -319,4 +297,4 @@ def _export_linear_fc1(linear_fc1): "CodeGemmaConfig2B", "CodeGemmaConfig7B", "GemmaModel", -] \ No newline at end of file +] From aed0b7c0b7403ee2d83ef8f47840be9c3be43c8e Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 24 Jun 2024 16:21:59 -0700 Subject: [PATCH 12/13] checks Signed-off-by: Chen Cui --- nemo/collections/llm/gpt/model/gemma.py | 65 ++++++++++++++++--------- nemo/collections/llm/gpt/model/llama.py | 2 +- 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index f933676c0c63..b45099996327 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -1,6 +1,6 @@ from dataclasses import dataclass from pathlib import Path -from typing import 
TYPE_CHECKING, Annotated, Callable, Generic, Optional, Type, TypeVar +from typing import TYPE_CHECKING, Annotated, Callable, Optional import torch @@ -8,7 +8,6 @@ from nemo.collections.llm.utils import Config from nemo.collections.nlp.modules.common.megatron.utils import openai_gelu from nemo.lightning import OptimizerModule, io, teardown -from nemo.lightning.io.connector import TargetT if TYPE_CHECKING: from transformers import GemmaForCausalLM @@ -72,7 +71,6 @@ def __init__( ): super().__init__(config or GemmaConfig(), optim=optim, tokenizer=tokenizer) - @io.model_importer(GemmaModel, "hf") class HFGemmaImporter(io.ModelConnector["GemmaForCausalLM", GemmaModel]): def init(self) -> GemmaModel: @@ -104,7 +102,12 @@ def convert_state(self, source, target): "model.norm.weight": "decoder.final_layernorm.weight", } - return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1]) + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=[_import_qkv, _import_linear_fc1] + ) @property def tokenizer(self) -> "AutoTokenizer": @@ -115,7 +118,6 @@ def tokenizer(self) -> "AutoTokenizer": @property def config(self) -> GemmaConfig: from transformers import GemmaConfig as HFGemmaConfig - source = HFGemmaConfig.from_pretrained(str(self)) def make_vocab_size_divisible_by(vocab_size): @@ -169,7 +171,12 @@ def convert_state(self, source, target): "decoder.final_layernorm.weight": "model.norm.weight", } - return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_linear_fc1]) + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=[_export_qkv, _export_linear_fc1] + ) @property def tokenizer(self): @@ -190,17 +197,17 @@ def config(self) -> "GemmaConfig": initializer_range=source.init_method_std, rms_norm_eps=source.layernorm_epsilon, num_key_value_heads=source.num_query_groups, - vocab_size=self.tokenizer.vocab_size, + vocab_size=self.tokenizer.vocab_size ) @io.state_transform( source_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", ), - target_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key="decoder.layers.*.self_attention.linear_qkv.weight" ) def _import_qkv(ctx: io.TransformCTX, q, k, v): megatron_config = ctx.target.config @@ -222,16 +229,22 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): qkv_weights_l = [] for i in range(num_query_groups): - qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) - qkv_weights_l.append(k[i : i + 1, :, :]) - qkv_weights_l.append(v[i : i + 1, :, :]) + qkv_weights_l.append( + q[i * heads_per_group: (i + 1) * heads_per_group, :, :] + ) + qkv_weights_l.append(k[i: i + 1, :, :]) + qkv_weights_l.append(v[i: i + 1, :, :]) qkv_weights = torch.cat(qkv_weights_l) assert qkv_weights.ndim == 3, qkv_weights.shape - assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape + assert ( + qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups + ), qkv_weights.shape assert qkv_weights.shape[1] == head_size, qkv_weights.shape assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape - qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + qkv_weights = qkv_weights.reshape( + [head_size * (head_num + 2 * 
num_query_groups), hidden_size] + ) return qkv_weights @@ -239,9 +252,9 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): @io.state_transform( source_key="decoder.layers.*.self_attention.linear_qkv.weight", target_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", ), ) def _export_qkv(ctx: io.TransformCTX, linear_qkv): @@ -273,8 +286,11 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv): @io.state_transform( - source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), - target_key="decoder.layers.*.mlp.linear_fc1.weight", + source_key=( + "model.layers.*.mlp.gate_proj.weight", + "model.layers.*.mlp.up_proj.weight" + ), + target_key="decoder.layers.*.mlp.linear_fc1.weight" ) def _import_linear_fc1(down, gate): return torch.cat((down, gate), axis=0).float() @@ -282,7 +298,10 @@ def _import_linear_fc1(down, gate): @io.state_transform( source_key="decoder.layers.*.mlp.linear_fc1.weight", - target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), + target_key=( + "model.layers.*.mlp.gate_proj.weight", + "model.layers.*.mlp.up_proj.weight" + ), ) def _export_linear_fc1(linear_fc1): gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) @@ -297,4 +316,4 @@ def _export_linear_fc1(linear_fc1): "CodeGemmaConfig2B", "CodeGemmaConfig7B", "GemmaModel", -] +] \ No newline at end of file diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 210101e1b206..aa089b077041 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -1,6 +1,6 @@ from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Annotated, Callable, Optional, Type +from typing import TYPE_CHECKING, Annotated, Callable, Optional import torch import torch.nn.functional as F From 189764d856c72b1d101d91792d58b55d9ea10e75 Mon Sep 17 00:00:00 2001 From: cuichenx Date: Mon, 24 Jun 2024 23:22:36 +0000 Subject: [PATCH 13/13] Apply isort and black reformatting Signed-off-by: cuichenx --- nemo/collections/llm/gpt/model/gemma.py | 62 +++++++++---------------- 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index b45099996327..ff9772b1b74c 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -71,6 +71,7 @@ def __init__( ): super().__init__(config or GemmaConfig(), optim=optim, tokenizer=tokenizer) + @io.model_importer(GemmaModel, "hf") class HFGemmaImporter(io.ModelConnector["GemmaForCausalLM", GemmaModel]): def init(self) -> GemmaModel: @@ -102,12 +103,7 @@ def convert_state(self, source, target): "model.norm.weight": "decoder.final_layernorm.weight", } - return io.apply_transforms( - source, - target, - mapping=mapping, - transforms=[_import_qkv, _import_linear_fc1] - ) + return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1]) @property def tokenizer(self) -> "AutoTokenizer": @@ -118,6 +114,7 @@ def tokenizer(self) -> "AutoTokenizer": @property def config(self) -> GemmaConfig: from transformers import GemmaConfig as HFGemmaConfig + source = HFGemmaConfig.from_pretrained(str(self)) def make_vocab_size_divisible_by(vocab_size): @@ -171,12 +168,7 @@ def 
convert_state(self, source, target): "decoder.final_layernorm.weight": "model.norm.weight", } - return io.apply_transforms( - source, - target, - mapping=mapping, - transforms=[_export_qkv, _export_linear_fc1] - ) + return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_linear_fc1]) @property def tokenizer(self): @@ -197,17 +189,17 @@ def config(self) -> "GemmaConfig": initializer_range=source.init_method_std, rms_norm_eps=source.layernorm_epsilon, num_key_value_heads=source.num_query_groups, - vocab_size=self.tokenizer.vocab_size + vocab_size=self.tokenizer.vocab_size, ) @io.state_transform( source_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", ), - target_key="decoder.layers.*.self_attention.linear_qkv.weight" + target_key="decoder.layers.*.self_attention.linear_qkv.weight", ) def _import_qkv(ctx: io.TransformCTX, q, k, v): megatron_config = ctx.target.config @@ -229,22 +221,16 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): qkv_weights_l = [] for i in range(num_query_groups): - qkv_weights_l.append( - q[i * heads_per_group: (i + 1) * heads_per_group, :, :] - ) - qkv_weights_l.append(k[i: i + 1, :, :]) - qkv_weights_l.append(v[i: i + 1, :, :]) + qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) qkv_weights = torch.cat(qkv_weights_l) assert qkv_weights.ndim == 3, qkv_weights.shape - assert ( - qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups - ), qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape assert qkv_weights.shape[1] == head_size, qkv_weights.shape assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape - qkv_weights = qkv_weights.reshape( - [head_size * (head_num + 2 * num_query_groups), hidden_size] - ) + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) return qkv_weights @@ -252,9 +238,9 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): @io.state_transform( source_key="decoder.layers.*.self_attention.linear_qkv.weight", target_key=( - "model.layers.*.self_attn.q_proj.weight", - "model.layers.*.self_attn.k_proj.weight", - "model.layers.*.self_attn.v_proj.weight", + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", ), ) def _export_qkv(ctx: io.TransformCTX, linear_qkv): @@ -286,11 +272,8 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv): @io.state_transform( - source_key=( - "model.layers.*.mlp.gate_proj.weight", - "model.layers.*.mlp.up_proj.weight" - ), - target_key="decoder.layers.*.mlp.linear_fc1.weight" + source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), + target_key="decoder.layers.*.mlp.linear_fc1.weight", ) def _import_linear_fc1(down, gate): return torch.cat((down, gate), axis=0).float() @@ -298,10 +281,7 @@ def _import_linear_fc1(down, gate): @io.state_transform( source_key="decoder.layers.*.mlp.linear_fc1.weight", - target_key=( - "model.layers.*.mlp.gate_proj.weight", - "model.layers.*.mlp.up_proj.weight" - ), + target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), ) def _export_linear_fc1(linear_fc1): 
gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) @@ -316,4 +296,4 @@ def _export_linear_fc1(linear_fc1): "CodeGemmaConfig2B", "CodeGemmaConfig7B", "GemmaModel", -] \ No newline at end of file +]
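
Usage sketch for the series: the patches above wire Llama 2/3, CodeLlama, and Gemma into nemo.collections.llm. Each family gets dataclass configs whose defaults mirror the corresponding Hugging Face checkpoints, a GPTModel subclass (LlamaModel, GemmaModel), and "hf" importer/exporter connectors registered through io.model_importer / io.model_exporter; the io mixin now keys its connector registries by str(cls) + ext, so several model classes can each register their own "hf" connector without clobbering one another. A minimal Python sketch of the new classes follows. The checkpoint id and output path are placeholders, and the connector-driven conversion is commented out because the public entry point for running an io.ModelConnector is not part of this series; treat that part as an assumption, not the supported API.

from pathlib import Path

from nemo.collections import llm
from nemo.collections.llm.gpt.model.llama import HFLlamaImporter

# Model wrappers built from the configs added in this series; defaults follow
# the corresponding HF models, and fields such as seq_length or rotary_base
# can be overridden per run.
llama2_7b = llm.LlamaModel(llm.Llama2Config7B())
gemma_2b = llm.GemmaModel(llm.GemmaConfig2B())

# Hypothetical HF -> NeMo conversion via the importer from patch 03: the
# connector is path-like (str(self) is the HF repo id or local path) and
# apply() runs convert_state() and saves the NeMo checkpoint.
# importer = HFLlamaImporter("meta-llama/Llama-2-7b-hf")  # placeholder repo id
# importer.apply(Path("llama2_7b_nemo"))                  # placeholder output path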