diff --git a/scripts/checkpoint_converters/convert_gemma_nemo_to_hf.py b/scripts/checkpoint_converters/convert_gemma_nemo_to_hf.py
new file mode 100644
index 000000000000..f1267d511728
--- /dev/null
+++ b/scripts/checkpoint_converters/convert_gemma_nemo_to_hf.py
@@ -0,0 +1,342 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from argparse import ArgumentParser
+from collections import OrderedDict
+
+import torch
+from omegaconf import open_dict
+from pytorch_lightning import Trainer
+from transformers import AutoModelForCausalLM, GemmaTokenizer, GemmaTokenizerFast, convert_slow_tokenizer
+
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids
+from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
+from nemo.utils import logging
+
+"""
+Script to convert a Gemma checkpoint in NeMo (mcore path) into a HuggingFace checkpoint.
+This script can be used to 1) generate only the HF weights, or 2) generate an entire HF model folder.
+This script is adapted from convert_llama_nemo_to_hf.py
+
+1) Generate only HF weights from a nemo file:
+
+    python convert_gemma_nemo_to_hf.py \
+    --input_name_or_path /workspace/pretrained/HF_TO_NEMO/gemma-2b-it \
+    --output_path /workspace/pretrained/NEMO_TO_HF/gemma-2b-it/pytorch_model.bin
+
+2) Generate the full HF model folder
+
+    python convert_gemma_nemo_to_hf.py \
+    --input_name_or_path /workspace/pretrained/HF_TO_NEMO/gemma-2b-it \
+    --output_path /workspace/pretrained/NEMO_TO_HF/gemma-2b-it/pytorch_model.bin \
+    --hf_input_path /workspace/pretrained/HF_MODELS/gemma-2b-it \
+    --hf_output_path /workspace/pretrained/NEMO_TO_HF/gemma-2b-it \
+    --input_tokenizer /workspace/pretrained/HF_MODELS/gemma-2b-it \
+    --hf_output_tokenizer /workspace/pretrained/NEMO_TO_HF/gemma-2b-it \
+    --precision 32
+
+    Use the --cpu-only flag if the model cannot fit in GPU memory.
+    However, this option makes the conversion script significantly slower.
+"""
+
+
+def get_args():
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--input_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to .nemo file or extracted folder",
+    )
+    parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to HF .bin file")
+    parser.add_argument(
+        "--hf_input_path",
+        type=str,
+        default=None,
+        help="A HF model path, e.g. a folder containing https://huggingface.co/google/gemma-2b-it/tree/main",
+    )
+    parser.add_argument(
+        "--hf_output_path",
+        type=str,
+        default=None,
+        help="Output HF model path, with the same format as above but user's own weights",
+    )
+    parser.add_argument(
+        "--input_tokenizer",
+        type=str,
+        default=None,
+        help="Path to tokenizer used for the input nemo model (need to extract the .nemo file first)",
+    )
+    parser.add_argument(
+        "--hf_output_tokenizer",
+        type=str,
+        default=None,
+        help="Path to save the tokenizer used for the output HF model.",
+    )
+    parser.add_argument(
+        "--precision",
+        type=str,
+        default=None,
+        help="Precision of output weights. "
+        "Defaults to the precision of the input nemo weights (model.cfg.trainer.precision)",
+    )
+    parser.add_argument(
+        "--cpu-only",
+        action="store_true",
+        help="Load model in cpu only. Useful if the model cannot fit in GPU memory, "
+        "but this option makes the conversion script significantly slower.",
+    )
+    args = parser.parse_args()
+    return args
+
+
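+# verify_forward() runs a single prompt through either the converted HF model ("hf") or the source NeMo
+# model ("nemo") and logs the greedy next-token prediction, as a quick sanity check that both models
+# agree on the continuation after conversion.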
+def verify_forward(model_path, tokenizer_path, model_string):
+    logging.info("=" * 100)
+    logging.info(f"Verifying forward pass for {model_string}")
+
+    input_texts = [
+        'query: how much protein should an adult eat',
+    ]
+    logging.info(f"Running verifications {input_texts} ...")
+
+    tokenizer = GemmaTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
+    tokenizer.pad_token = tokenizer.eos_token
+    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors="pt")
+    batch_dict_cuda = {k: v.cuda() for k, v in batch_dict.items()}
+
+    if model_string == "hf":
+        model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True)
+        model = model.cuda().eval()
+        outputs = model(**batch_dict_cuda, output_hidden_states=True)
+        next_token = outputs.logits[0, -1].argmax()
+    elif model_string == 'nemo':
+        dummy_trainer = Trainer(devices=1, accelerator='auto', strategy=NLPDDPStrategy())
+        model_config = MegatronGPTModel.restore_from(model_path, trainer=dummy_trainer, return_config=True)
+        model_config.tensor_model_parallel_size = 1
+        model_config.pipeline_model_parallel_size = 1
+        model = MegatronGPTModel.restore_from(
+            model_path, trainer=dummy_trainer, override_config_path=model_config, map_location=None
+        )
+
+        ids = batch_dict_cuda['input_ids']
+        id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids.cpu()]
+        masks_and_position_ids = [
+            get_ltor_masks_and_position_ids(id_tensor, tokenizer.eos_token, False, False, False)
+            for id_tensor in id_tensors
+        ]
+
+        for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids):
+            attn_mask, _, pos_ids = attn_mask_and_pos_ids
+
+            outputs = model(
+                tokens=tokens, text_position_ids=pos_ids.cuda(), attention_mask=attn_mask.cuda(), labels=None
+            )
+            next_token = outputs.squeeze()[-1].argmax()
+    else:
+        raise ValueError(f"Model string {model_string} not recognized.")
+
+    logging.info(f"{model_string} predicted next token is: '{tokenizer.convert_ids_to_tokens([next_token])}'.")
+    logging.info("=" * 100)
+
+
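+# Key mapping produced by convert() (NeMo mcore name -> HF Gemma name):
+#   model.embedding.word_embeddings.weight                      -> model.embed_tokens.weight (tied to lm_head.weight)
+#   model.decoder.layers.{l}.self_attention.linear_qkv.weight   -> model.layers.{l}.self_attn.{q,k,v}_proj.weight
+#   model.decoder.layers.{l}.self_attention.linear_proj.weight  -> model.layers.{l}.self_attn.o_proj.weight
+#   model.decoder.layers.{l}.mlp.linear_fc1.weight              -> model.layers.{l}.mlp.{gate,up}_proj.weight
+#   model.decoder.layers.{l}.mlp.linear_fc2.weight              -> model.layers.{l}.mlp.down_proj.weight
+#   ...linear_qkv.layer_norm_weight (minus 1.0)                 -> model.layers.{l}.input_layernorm.weight
+#   ...mlp.linear_fc1.layer_norm_weight (minus 1.0)             -> model.layers.{l}.post_attention_layernorm.weight
+#   model.decoder.final_layernorm.weight (minus 1.0)            -> model.norm.weight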
+def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> torch.dtype:
+    """
+    Convert NeMo weights to HF weights. Returns the torch dtype used for the exported weights.
+    """
+    dummy_trainer = Trainer(devices=1, accelerator='cpu', strategy=NLPDDPStrategy())
+    model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
+    model_config.tensor_model_parallel_size = 1
+    model_config.pipeline_model_parallel_size = 1
+    if cpu_only:
+        map_location = torch.device('cpu')
+        model_config.use_cpu_initialization = True
+        logging.info("******** Loading model on CPU. This will take a significant amount of time.")
+    else:
+        map_location = None
+
+    model = MegatronGPTModel.restore_from(
+        input_nemo_file, trainer=dummy_trainer, override_config_path=model_config, map_location=map_location
+    )
+    if precision is None:
+        precision = model.cfg.precision
+    if precision in [32, "32"]:
+        dtype = torch.float32
+    elif precision in [16, "16", "16-mixed"]:
+        dtype = torch.float16
+    elif precision in ["bf16", "bf16-mixed"]:
+        dtype = torch.bfloat16
+    else:
+        logging.warning(f"Precision string {precision} is not recognized, falling back to fp32")
+        dtype = torch.float32  # fallback
+    logging.info(f"Using precision {dtype}")
+
+    param_to_weights = lambda param: param.to(dtype)
+    checkpoint = OrderedDict()
+
+    hidden_size = model.cfg.hidden_size
+    head_num = model.cfg.num_attention_heads
+    num_layers = model.cfg.num_layers
+    ffn_hidden_size = model.cfg.ffn_hidden_size
+    num_query_groups = model.cfg.get("num_query_groups", head_num)  # MQA/GQA checkpoints have fewer query groups than heads
+
+    head_size = hidden_size // head_num
+    heads_per_group = head_num // num_query_groups
+    qkv_total_dim = head_num + 2 * num_query_groups
+
+    # Embedding
+    embed_weight = model.state_dict()[f'model.embedding.word_embeddings.weight']
+    embed_weights_base_name = f'model.embed_tokens.weight'
+    checkpoint[embed_weights_base_name] = param_to_weights(embed_weight)
+    for l in range(int(num_layers)):
+        print(f"converting layer {l}")
+
+        # Attention: Megatron packs Q, K and V into a single linear_qkv weight, interleaved per query group.
+        qkv_weights = model.state_dict()[f'model.decoder.layers.{l}.self_attention.linear_qkv.weight']
+        qkv_weights = qkv_weights.reshape([qkv_total_dim, head_size, hidden_size])
+
+        q_slice = torch.cat(
+            [
+                torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
+                for i in range(num_query_groups)
+            ]
+        )
+        k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
+        v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
+        ## Example of slices
+        ## with num_query_groups = head_num = 32:
+        ## q_slice = [0, 3, 6, 9 , ... 90, 93]
+        ## k_slice = [1, 4, 7, 10, ... 91, 94]
+        ## v_slice = [2, 5, 8, 11, ... 92, 95]
+        ## with GQA, e.g. num_query_groups = 8, head_num = 64:
+        ## q_slice = [0, 1, .. 6, 7, 10, 11, .. 16, 17, 20, 21, .. 67, 70, ... 76, 77]
+        ## k_slice = [8, 18, 28, ... 68, 78]
+        ## v_slice = [9, 19, 29, ... 69, 79]
+
+        q_weights_base_name = f'model.layers.{l}.self_attn.q_proj.weight'
+        k_weights_base_name = f'model.layers.{l}.self_attn.k_proj.weight'
+        v_weights_base_name = f'model.layers.{l}.self_attn.v_proj.weight'
+
+        checkpoint[q_weights_base_name] = param_to_weights(qkv_weights[q_slice].reshape(-1, hidden_size))
+        checkpoint[k_weights_base_name] = param_to_weights(qkv_weights[k_slice].reshape(-1, hidden_size))
+        checkpoint[v_weights_base_name] = param_to_weights(qkv_weights[v_slice].reshape(-1, hidden_size))
+
+        # Attention dense (output projection)
+        o_weight = model.state_dict()[f'model.decoder.layers.{l}.self_attention.linear_proj.weight']
+        o_weight_base_name = f'model.layers.{l}.self_attn.o_proj.weight'
+        checkpoint[o_weight_base_name] = param_to_weights(o_weight)
+
+        # MLP: Megatron packs the gated-MLP input projections into linear_fc1; the first ffn_hidden_size
+        # rows are the gate projection, the remaining rows are the up projection.
+        mlp_fc1_weight = model.state_dict()[f'model.decoder.layers.{l}.mlp.linear_fc1.weight']
+        mlp_gate_proj_weight = mlp_fc1_weight[:ffn_hidden_size, :]
+        mlp_up_proj_weight = mlp_fc1_weight[ffn_hidden_size:, :]
+
+        mlp_gate_proj_base_name = f'model.layers.{l}.mlp.gate_proj.weight'
+        mlp_up_proj_base_name = f'model.layers.{l}.mlp.up_proj.weight'
+
+        checkpoint[mlp_gate_proj_base_name] = param_to_weights(mlp_gate_proj_weight)
+        checkpoint[mlp_up_proj_base_name] = param_to_weights(mlp_up_proj_weight)
+
+        mlp_down_proj_weight = model.state_dict()[f'model.decoder.layers.{l}.mlp.linear_fc2.weight']
+        mlp_down_proj_base_name = f'model.layers.{l}.mlp.down_proj.weight'
+        checkpoint[mlp_down_proj_base_name] = param_to_weights(mlp_down_proj_weight)
+
+        # Layernorm: HF GemmaRMSNorm multiplies by (1 + weight), while the NeMo checkpoint stores the
+        # full scale, so subtract 1.0 when exporting.
+        input_ln_weight = model.state_dict()[f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm_weight']
+        input_ln_base_name = f'model.layers.{l}.input_layernorm.weight'
+        checkpoint[input_ln_base_name] = param_to_weights(input_ln_weight - 1.0)
+
+        post_attn_ln_weight = model.state_dict()[f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm_weight']
+        post_attn_ln_base_name = f'model.layers.{l}.post_attention_layernorm.weight'
+        checkpoint[post_attn_ln_base_name] = param_to_weights(post_attn_ln_weight - 1.0)
+
+        print(f"done layer {l}")
+
+    final_ln_weight = model.state_dict()[f'model.decoder.final_layernorm.weight']
+    final_ln_base_name = f'model.norm.weight'
+    checkpoint[final_ln_base_name] = param_to_weights(final_ln_weight - 1.0)
+
+    # NOTE: Gemma uses weight tying, so the LM head reuses the embedding weights
+    # instead of model.state_dict()[f'model.output_layer.weight'].
+    output_layer_weight = model.state_dict()[f'model.embedding.word_embeddings.weight']
+    output_layer_base_name = f'lm_head.weight'
+    checkpoint[output_layer_base_name] = param_to_weights(output_layer_weight)
+
+    os.makedirs(os.path.dirname(output_hf_file), exist_ok=True)
+    torch.save(checkpoint, output_hf_file)
+    logging.info(f"Weights saved to {output_hf_file}")
+
+    return dtype
+
+
+def replace_hf_weights_and_tokenizer(
+    weights_file,
+    dtype,
+    input_hf_path,
+    output_hf_path,
+    tokenizer_path,
+    output_hf_tokenizer,
+):
+    model = AutoModelForCausalLM.from_pretrained(
+        input_hf_path,
+        local_files_only=True,
+        torch_dtype=dtype,
+    )
+    nemo_exported = torch.load(weights_file)
+
+    if tokenizer_path:
+        tokenizer = GemmaTokenizer.from_pretrained(
+            tokenizer_path,
+            local_files_only=True,
+            legacy=False,
+        )
+        tmp_tokenizer = convert_slow_tokenizer.convert_slow_tokenizer(tokenizer)
+        fast_tokenizer = GemmaTokenizerFast(tokenizer_object=tmp_tokenizer)
+        tokenizer_length = len(fast_tokenizer)
+        model.resize_token_embeddings(tokenizer_length)
+
+    model.load_state_dict(nemo_exported)
+    model.save_pretrained(output_hf_path)
+    logging.info(f"Full HF model saved to {output_hf_path}")
+
+    if tokenizer_path:
+        fast_tokenizer.save_pretrained(output_hf_tokenizer)
+        tokenizer.save_pretrained(output_hf_tokenizer)
+        logging.info(f"Tokenizer saved to {output_hf_tokenizer}")
+
+
+if __name__ == '__main__':
+    args = get_args()
+    if not args.hf_output_tokenizer and args.hf_output_path:
+        args.hf_output_tokenizer = args.hf_output_path
+    dtype = convert(args.input_name_or_path, args.output_path, precision=args.precision, cpu_only=args.cpu_only)
+    if args.hf_input_path and args.hf_output_path:
+        replace_hf_weights_and_tokenizer(
+            args.output_path,
+            dtype,
+            args.hf_input_path,
+            args.hf_output_path,
+            args.input_tokenizer,
+            args.hf_output_tokenizer,
+        )
+        verify_forward(args.input_name_or_path, args.hf_output_tokenizer, "nemo")
+        verify_forward(args.hf_output_path, args.hf_output_tokenizer, "hf")
+    else:
+        logging.info("`hf_input_path` and/or `hf_output_path` not provided, not generating full HF model.")
+        logging.info(f".bin file is saved to {args.output_path}")
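
Note: as a quick post-conversion smoke test, the exported folder from mode 2) can be loaded directly with
transformers. This is a minimal sketch, assuming the full HF model and tokenizer were written to the example
path /workspace/pretrained/NEMO_TO_HF/gemma-2b-it used in the docstring above:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    export_dir = "/workspace/pretrained/NEMO_TO_HF/gemma-2b-it"  # example output path from the docstring
    tokenizer = AutoTokenizer.from_pretrained(export_dir, local_files_only=True)
    model = AutoModelForCausalLM.from_pretrained(export_dir, torch_dtype=torch.float32, local_files_only=True).eval()

    inputs = tokenizer("query: how much protein should an adult eat", return_tensors="pt")
    with torch.no_grad():
        generated = model.generate(**inputs, max_new_tokens=16)
    print(tokenizer.decode(generated[0], skip_special_tokens=True))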