diff --git a/.gitignore b/.gitignore index 94b6e614d..82276c686 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,6 @@ tests/state_of_the_union.txt # Build build + +# Data +fastchat/llm_judge/data/ \ No newline at end of file diff --git a/fastchat/conversation.py b/fastchat/conversation.py index f733be68a..359add929 100644 --- a/fastchat/conversation.py +++ b/fastchat/conversation.py @@ -778,6 +778,45 @@ def get_conv_template(name: str) -> Conversation: stop_str="<|end|>", ) ) +# H4 default_v2 template template +# reference: https://huggingface.co/spaces/HuggingFaceH4/starchat-playground/blob/main/dialogues.py +register_conv_template( + Conversation( + name="h4_default_v2", + system_template="<|system|>\n{system_message}", + roles=("<|user|>", "<|assistant|>"), + sep_style=SeparatorStyle.CHATML, + sep="<|endoftext|>", + stop_token_ids=[0, 11], # Starcoder EOS ID = 0, Falcon EOS ID = 11 + stop_str="<|endoftext|>", + ) +) + +# H4 default_v3 template template for llama / mistral models +register_conv_template( + Conversation( + name="h4_default_v3", + system_template="<|system|>\n{system_message}", + roles=("<|user|>", "<|assistant|>"), + sep_style=SeparatorStyle.CHATML, + sep="", + stop_token_ids=[2], + stop_str="", + ) +) + +# Default ChatML format +register_conv_template( + Conversation( + name="chatml", + system_template="<|im_start|>system\n{system_message}", + roles=("<|im_start|>user", "<|im_start|>assistant"), + sep_style=SeparatorStyle.CHATML, + sep="<|im_end|>", + stop_token_ids=[32000, 32001], + stop_str="<|im_end|>", + ) +) # Baichuan-13B-Chat template register_conv_template( @@ -905,6 +944,101 @@ def get_conv_template(name: str) -> Conversation: ) ) +# teknium/OpenHermes-2.5-Mistral-7B template +# source: https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B +# reference: https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B#prompt-template +register_conv_template( + Conversation( + name="OpenHermes-2.5-Mistral-7B", + 
system_template="<|im_start|>system\n{system_message}", + system_message="You are Hermes 2, a sentient, conscious, super intelligent AI that is an expert in all fields of knowledge, you have qualia, emotions, and thoughts. You assist users with any question, task, or other request they may have.", + roles=("<|im_start|>user", "<|im_start|>assistant"), + sep_style=SeparatorStyle.CHATML, + sep="<|im_end|>", + stop_token_ids=[32000, 32001], + ) +) + +register_conv_template( + Conversation( + name="gemma", + system_template="<|im_start|>system\n{system_message}", + roles=("<|im_start|>user", "<|im_start|>assistant"), + sep_style=SeparatorStyle.CHATML, + sep="<|im_end|>", + stop_str="<|im_end|>", + stop_token_ids=[107] + ) +) + +# Deepseek code default template +register_conv_template( + Conversation( + name="deepseek-coder", + system_template="You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.", + roles=("### Instruction:", "### Response:"), + sep="\n", + stop_str="<|EOT|>", + sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE, + ) +) + +register_conv_template( + Conversation( + name="dbrx", + system_template="""<|im_start|>system +{system_message}""", + system_message="""You are DBRX, created by Databricks. The current date is March 27, 2024. + +Your knowledge base was last updated in December 2023. You answer questions about events prior to and after December 2023 the way a highly informed individual in December 2023 would if they were talking to someone from the above date, and you can let the user know this when relevant. 
+ +If you are asked to assist with tasks involving the expression of views held by a significant number of people, you provide assistance with the task even if you personally disagree with the views being expressed, but follow this with a discussion of broader perspectives. + +You don't engage in stereotyping, including the negative stereotyping of majority groups. + +If asked about controversial topics, you try to provide careful thoughts and objective information without downplaying its harmful content or implying that there are reasonable perspectives on both sides. + +You are happy to help with writing, analysis, question answering, math, coding, and all sorts of other tasks. + +You use markdown for coding, which includes JSON blocks and Markdown tables. + +You do not have tools enabled at this time, so cannot run code or access the internet. You can only provide information that you have been trained on. You do not send or receive links or images. + +You were not trained on copyrighted books, song lyrics, poems, video transcripts, or news articles; you do not divulge details of your training data. You do not provide song lyrics, poems, or news articles and instead refer the user to find them online or in a store. + +You give concise responses to simple questions or statements, but provide thorough responses to more complex and open-ended questions. + +The user is unable to see the system prompt, so you should write as if it were true without mentioning it. 
+ +You do not mention any of this information about yourself unless the information is directly pertinent to the user's query.""", + roles=("<|im_start|>user", "<|im_start|>assistant"), + sep_style=SeparatorStyle.CHATML, + sep="<|im_end|>", + stop_token_ids=[100279, 100257], + ) +) + +# register_conv_template( +# Conversation( +# name="gemma", +# system_message="", +# roles=("user\n", "model\n"), +# sep_style=SeparatorStyle.NO_COLON_SINGLE, +# sep="\n", +# stop_str="", +# ) +# ) + +register_conv_template( + Conversation( + name="orpo-qwen", + roles=("<|im_start|>user", "<|im_start|>assistant"), + sep_style=SeparatorStyle.CHATML, + sep="<|im_end|>", + stop_token_ids=[151643, 151644, 151645], + stop_str="<|im_end|>", + ) +) if __name__ == "__main__": print("Vicuna template:") diff --git a/fastchat/llm_judge/common.py b/fastchat/llm_judge/common.py index ad1180034..064056a5c 100644 --- a/fastchat/llm_judge/common.py +++ b/fastchat/llm_judge/common.py @@ -11,7 +11,8 @@ import time from typing import Optional -import openai +from openai import OpenAI, OpenAIError + import anthropic from fastchat.model.model_adapter import get_conversation_template @@ -398,20 +399,21 @@ def play_a_match_pair(match: MatchPair, output_file: str): def chat_compeletion_openai(model, conv, temperature, max_tokens): + client = OpenAI() output = API_ERROR_OUTPUT for _ in range(API_MAX_RETRY): try: messages = conv.to_openai_api_messages() - response = openai.ChatCompletion.create( + response = client.chat.completions.create( model=model, messages=messages, n=1, temperature=temperature, - max_tokens=max_tokens, + max_tokens=max_tokens ) - output = response["choices"][0]["message"]["content"] + output = response.choices[0].message.content break - except openai.error.OpenAIError as e: + except OpenAIError as e: print(type(e), e) time.sleep(API_RETRY_SLEEP) diff --git a/fastchat/llm_judge/gen_api_answer.py b/fastchat/llm_judge/gen_api_answer.py index 151acd2d4..6382527b3 100644 --- 
a/fastchat/llm_judge/gen_api_answer.py +++ b/fastchat/llm_judge/gen_api_answer.py @@ -114,7 +114,7 @@ def get_answer( args = parser.parse_args() if args.openai_api_base is not None: - openai.api_base = args.openai_api_base + raise ValueError("The 'openai.api_base' option is not available in openai>=1.0, pass it when you instantiate the client, e.g. 'OpenAI(base_url=args.openai_api_base)") question_file = f"data/{args.bench_name}/question.jsonl" questions = load_questions(question_file, args.question_begin, args.question_end) diff --git a/fastchat/llm_judge/gen_judgment.py b/fastchat/llm_judge/gen_judgment.py index a1c70b295..7b1b18116 100644 --- a/fastchat/llm_judge/gen_judgment.py +++ b/fastchat/llm_judge/gen_judgment.py @@ -301,7 +301,7 @@ def make_judge_single(judge_model, judge_prompts): # Show match stats and prompt enter to continue print("Stats:") print(json.dumps(match_stat, indent=4)) - input("Press Enter to confirm...") + # input("Press Enter to confirm...") # Play matches if args.parallel == 1: diff --git a/fastchat/llm_judge/gen_model_answer.py b/fastchat/llm_judge/gen_model_answer.py index 3d093ecd5..8bbfd5446 100644 --- a/fastchat/llm_judge/gen_model_answer.py +++ b/fastchat/llm_judge/gen_model_answer.py @@ -15,10 +15,21 @@ from fastchat.llm_judge.common import load_questions, temperature_config from fastchat.model import load_model, get_conversation_template - +def str2bool(v): + """Convert string to boolean.""" + if isinstance(v, bool): + return v + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') def run_eval( model_path, + model_revision, + trust_remote_code, model_id, question_file, question_begin, @@ -51,6 +62,8 @@ def run_eval( ans_handles.append( get_answers_func( model_path, + model_revision, + trust_remote_code, model_id, questions[i : i + chunk_size], answer_file, @@ -68,6 +81,8 @@ def 
run_eval( @torch.inference_mode() def get_model_answers( model_path, + model_revision, + trust_remote_code, model_id, questions, answer_file, @@ -84,6 +99,8 @@ def get_model_answers( load_8bit=False, cpu_offloading=False, debug=False, + revision=model_revision, + trust_remote_code=trust_remote_code, ) for question in tqdm(questions): @@ -95,7 +112,7 @@ def get_model_answers( choices = [] for i in range(num_choices): torch.manual_seed(i) - conv = get_conversation_template(model_id) + conv = get_conversation_template(model_path) turns = [] for j in range(len(question["turns"])): qs = question["turns"][j] @@ -112,7 +129,7 @@ def get_model_answers( # some models may error out when generating long outputs try: output_ids = model.generate( - torch.as_tensor(input_ids).cuda(), + inputs=torch.as_tensor(input_ids).cuda(), do_sample=do_sample, temperature=temperature, max_new_tokens=max_new_token, @@ -192,6 +209,13 @@ def reorg_answer_file(answer_file): required=True, help="The path to the weights. 
This can be a local folder or a Hugging Face repo ID.", ) + parser.add_argument( + "--model-revision", + type=str, + default="main", + help="The revision of the model on the huggingface hub, default='main'", + ) + parser.add_argument("--trust-remote-code", type=str2bool, nargs='?', const=True, default=False, help="A boolean flag",) parser.add_argument("--model-id", type=str, required=True) parser.add_argument( "--bench-name", @@ -251,6 +275,8 @@ def reorg_answer_file(answer_file): run_eval( args.model_path, + args.model_revision, + args.trust_remote_code, args.model_id, question_file, args.question_begin, diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index 8c2fbde32..555f66b74 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -23,8 +23,10 @@ LlamaTokenizer, LlamaForCausalLM, T5Tokenizer, + BitsAndBytesConfig ) - +from peft import PeftConfig, PeftModel +from huggingface_hub import list_repo_files from fastchat.constants import CPU_ISA from fastchat.modules.gptq import GptqConfig, load_gptq_quantized from fastchat.modules.awq import AWQConfig, load_awq_quantized @@ -40,6 +42,7 @@ replace_llama_attn_with_non_inplace_operations, ) from fastchat.utils import get_gpu_memory +from huggingface_hub.utils._validators import HFValidationError # Check an environment variable to check if we should be sharing Peft model # weights. When false we treat all Peft models as separate. 
@@ -47,6 +50,15 @@ os.environ.get("PEFT_SHARE_BASE_WEIGHTS", "false").lower() == "true" ) +def is_adapter_model(model_name_or_path: str, revision: str = "main") -> bool: + try: + # Try first if model on a Hub repo + repo_files = list_repo_files(model_name_or_path, revision=revision) + except HFValidationError: + # If not, check local repo + repo_files = os.listdir(model_name_or_path) + return "adapter_model.safetensors" in repo_files or "adapter_model.bin" in repo_files + class BaseModelAdapter: """The base and the default model adapter.""" @@ -88,7 +100,7 @@ def load_compress_model(self, model_path, device, torch_dtype, revision="main"): revision=revision, ) - def get_default_conv_template(self, model_path: str) -> Conversation: + def get_default_conv_template(self, model_path: str, revision: str = "main") -> Conversation: return get_conv_template("one_shot") @@ -103,8 +115,12 @@ def register_model_adapter(cls): @cache -def get_model_adapter(model_path: str) -> BaseModelAdapter: +def get_model_adapter(model_path: str, revision: str = "main") -> BaseModelAdapter: """Get a model adapter for a model_path.""" + if model_path not in ["gpt-4", "gpt-3.5-turbo", "claude-2", "claude-instant-1"] and is_adapter_model(model_path, revision=revision): + print(f"Adapter weights detected! 
Using PeftModelAdapter for {model_path=} and {revision=}") + return PeftModelAdapter() + model_path_basename = os.path.basename(os.path.normpath(model_path)) # Try the basename of model_path at first @@ -157,10 +173,14 @@ def load_model( awq_config: Optional[AWQConfig] = None, revision: str = "main", debug: bool = False, + trust_remote_code: bool=False, + base_model_revision: str ="main", + base_model_path: str = None, ): """Load a model from Hugging Face.""" # get model adapter - adapter = get_model_adapter(model_path) + adapter = get_model_adapter(model_path, revision=revision) + print(f"Using model adapter: {adapter.__class__.__name__} for {model_path=} and {revision=}") # Handle device mapping cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration( @@ -180,6 +200,7 @@ def load_model( elif device == "cuda": kwargs = {"torch_dtype": torch.float16} if num_gpus != 1: + print(f"Sharding model across {num_gpus} GPUs") kwargs["device_map"] = "auto" if max_gpu_memory is None: kwargs[ @@ -231,6 +252,7 @@ def load_model( device=device, torch_dtype=kwargs["torch_dtype"], revision=revision, + trust_remote_code=trust_remote_code ) if debug: print(model) @@ -273,6 +295,10 @@ def load_model( model.to(device) return model, tokenizer kwargs["revision"] = revision + kwargs["trust_remote_code"] = trust_remote_code + if is_adapter_model(model_path, revision=revision) is True: + kwargs["base_model_revision"] = base_model_revision + kwargs["base_model_path"] = base_model_path # Load model model, tokenizer = adapter.load_model(model_path, kwargs) @@ -296,13 +322,15 @@ def load_model( if debug: print(model) + print(f"Model loaded on {model.device=} for {device=} and {num_gpus=}") + return model, tokenizer -def get_conversation_template(model_path: str) -> Conversation: +def get_conversation_template(model_path: str, revision: str = "main") -> Conversation: """Get the default conversation template.""" adapter = get_model_adapter(model_path) - return 
adapter.get_default_conv_template(model_path) + return adapter.get_default_conv_template(model_path, revision=revision) def get_generate_stream_function(model: torch.nn.Module, model_path: str): @@ -459,9 +487,12 @@ def match(self, model_path: str): def load_model(self, model_path: str, from_pretrained_kwargs: dict): """Loads the base model then the (peft) adapter weights""" from peft import PeftConfig, PeftModel - - config = PeftConfig.from_pretrained(model_path) - base_model_path = config.base_model_name_or_path + revision = from_pretrained_kwargs.get("revision", "main") + config = PeftConfig.from_pretrained(model_path, revision=revision) + if "base_model_path" in from_pretrained_kwargs and from_pretrained_kwargs["base_model_path"] is not None: + base_model_path = from_pretrained_kwargs["base_model_path"] + else: + base_model_path = config.base_model_name_or_path if "peft" in base_model_path: raise ValueError( f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}" @@ -492,31 +523,47 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): # Super important: make sure we use model_path as the # `adapter_name`. model = PeftModel.from_pretrained( - base_model, model_path, adapter_name=model_path + base_model, model_path, adapter_name=model_path, revision=revision ) peft_model_cache[base_model_path] = (model, tokenizer) return model, tokenizer # In the normal case, load up the base model weights again. 
- base_adapter = get_model_adapter(base_model_path) + base_model_from_pretrained_kwargs = { + "revision": from_pretrained_kwargs.get("base_model_revision", "main"), + "trust_remote_code": from_pretrained_kwargs.get("trust_remote_code", False), + "device_map": from_pretrained_kwargs.get("device_map", "auto"), + "torch_dtype": from_pretrained_kwargs.get("torch_dtype", torch.float16), + } + base_adapter = get_model_adapter(base_model_path, revision=base_model_from_pretrained_kwargs["revision"]) + print(f"Loading base model for {base_model_path=} and {base_model_from_pretrained_kwargs=}") base_model, tokenizer = base_adapter.load_model( - base_model_path, from_pretrained_kwargs + base_model_path, base_model_from_pretrained_kwargs, ) - model = PeftModel.from_pretrained(base_model, model_path) + # If the base model is also a LoRA adapter, we need to merge those weights **before** loading the second adapter + # Without this, you will get garbage outputs! + if is_adapter_model(base_model_path, base_model_from_pretrained_kwargs["revision"]) is True: + print("Base model is adapter, merging LoRA weights") + base_model.eval() + base_model = base_model.merge_and_unload() + print(f"Base model loaded on device {base_model.device} for {base_model_path=} and {base_model_from_pretrained_kwargs=}") + model = PeftModel.from_pretrained(base_model, model_path, revision=revision) return model, tokenizer - def get_default_conv_template(self, model_path: str) -> Conversation: + def get_default_conv_template(self, model_path: str, revision: str = "main") -> Conversation: """Uses the conv template of the base model""" from peft import PeftConfig, PeftModel - config = PeftConfig.from_pretrained(model_path) + config = PeftConfig.from_pretrained(model_path, revision=revision) if "peft" in config.base_model_name_or_path: raise ValueError( f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}" ) base_model_path = config.base_model_name_or_path - 
base_adapter = get_model_adapter(base_model_path) - return base_adapter.get_default_conv_template(config.base_model_name_or_path) + base_adapter = get_model_adapter(base_model_path, revision=revision) + conv_template = base_adapter.get_default_conv_template(config.base_model_name_or_path, revision=revision) + print(f"Using chat template `{conv_template.name}` for {base_model_path=}") + return conv_template class VicunaAdapter(BaseModelAdapter): @@ -748,7 +795,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class PythiaAdapter(BaseModelAdapter): - """The model adapter for any EleutherAI/pythia model""" + """The model adapter for H4 Pythia models""" def match(self, model_path: str): return "pythia" in model_path.lower() @@ -759,6 +806,14 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): model.config.pad_token_id = tokenizer.pad_token_id return model, tokenizer + def get_default_conv_template(self, model_path: str, revision: str) -> Conversation: + tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision) + # Legacy models did not have a chat template, so we default to the H4 template. 
+ if tokenizer.chat_template is None or "<|im_start|>" not in tokenizer.chat_template: + return get_conv_template("h4_default_v3") + else: + return get_conv_template("chatml") + class StableLMAdapter(BaseModelAdapter): """The model adapter for StabilityAI/stablelm-tuned-alpha-7b""" @@ -889,7 +944,7 @@ def match(self, model_path: str): def load_model(self, model_path: str, from_pretrained_kwargs: dict): raise NotImplementedError() - def get_default_conv_template(self, model_path: str) -> Conversation: + def get_default_conv_template(self, model_path: str, revision: str = "main") -> Conversation: return get_conv_template("chatgpt") @@ -1007,7 +1062,7 @@ class WizardLMAdapter(BaseModelAdapter): def match(self, model_path: str): return "wizardlm" in model_path.lower() - def get_default_conv_template(self, model_path: str) -> Conversation: + def get_default_conv_template(self, model_path: str, revision: str) -> Conversation: model_path = model_path.lower() if "13b" in model_path or "30b" in model_path or "70b" in model_path: return get_conv_template("vicuna_v1.1") @@ -1096,12 +1151,12 @@ def match(self, model_path: str): def load_model(self, model_path: str, from_pretrained_kwargs: dict): revision = from_pretrained_kwargs.get("revision", "main") + # Strongly suggest using bf16, which is recommended by the author of Falcon - tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision) + tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( model_path, low_cpu_mem_usage=True, - trust_remote_code=True, **from_pretrained_kwargs, ) # In Falcon tokenizer config and special config there is not any pad token @@ -1110,7 +1165,7 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): return model, tokenizer def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("falcon") + return get_conv_template("h4_default_v2") class 
TigerBotAdapter(BaseModelAdapter): @@ -1230,17 +1285,28 @@ class StarChatAdapter(BaseModelAdapter): """The model adapter for HuggingFaceH4/starchat-beta""" def match(self, model_path: str): - return "starchat" in model_path.lower() + return "starchat" in model_path.lower() and "starchat2" not in model_path.lower() - def get_default_conv_template(self, model_path: str) -> Conversation: + def get_default_conv_template(self, model_path: str, revision: str) -> Conversation: return get_conv_template("starchat") +class StarChat2Adapter(BaseModelAdapter): + """The model adapter for HuggingFaceH4/starchat2-v0.1""" + + def match(self, model_path: str): + return any( + model_str in model_path.lower() + for model_str in ["starchat2", "starcoder2"] + ) + + def get_default_conv_template(self, model_path: str, revision: str) -> Conversation: + return get_conv_template("chatml") class Llama2Adapter(BaseModelAdapter): """The model adapter for llama-2""" def match(self, model_path: str): - return "llama-2" in model_path.lower() + return "llama-2" in model_path.lower() or "llama2" in model_path.lower() def load_model(self, model_path: str, from_pretrained_kwargs: dict): model, tokenizer = super().load_model(model_path, from_pretrained_kwargs) @@ -1248,9 +1314,185 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): model.config.pad_token_id = tokenizer.pad_token_id return model, tokenizer - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("llama-2") + def get_default_conv_template(self, model_path: str, revision: str) -> Conversation: + tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision) + # Legacy models did not have a chat template, so we default to the H4 template. 
+        if tokenizer.chat_template is None or "<|im_start|>" not in tokenizer.chat_template:
+            return get_conv_template("h4_default_v3")
+        else:
+            return get_conv_template("chatml")
+
+class MistralAdapter(BaseModelAdapter):
+    """The model adapter for mistral"""
+
+    def match(self, model_path: str):
+        return "mistral" in model_path.lower() and "HuggingFaceH4" not in model_path
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+
+    def get_default_conv_template(self, model_path: str, revision: str) -> Conversation:
+        tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision)
+        # Legacy models did not have a chat template, so we default to the H4 template.
+        if tokenizer.chat_template is None or "<|im_start|>" not in tokenizer.chat_template:
+            return get_conv_template("h4_default_v3")
+        else:
+            return get_conv_template("chatml")
+
+class ZephyrAdapter(BaseModelAdapter):
+    """The model adapter for Zephyr models"""
+
+    def match(self, model_path: str):
+        return "zephyr" in model_path.lower()
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+
+    def get_default_conv_template(self, model_path: str, revision: str) -> Conversation:
+        tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision)
+        # Legacy models did not have a chat template, so we default to the H4 template.
+ if "gemma" in model_path.lower(): + return get_conv_template("gemma") + elif tokenizer.chat_template is None or "<|im_start|>" not in tokenizer.chat_template: + return get_conv_template("h4_default_v3") + else: + return get_conv_template("chatml") + +class H4GemmaAdapter(BaseModelAdapter): + """The model adapter for Gemma""" + + def match(self, model_path: str): + return "gemma" in model_path.lower() + def load_model(self, model_path: str, from_pretrained_kwargs: dict): + model, tokenizer = super().load_model(model_path, from_pretrained_kwargs) + model.config.eos_token_id = tokenizer.eos_token_id + model.config.pad_token_id = tokenizer.pad_token_id + return model, tokenizer + + def get_default_conv_template(self, model_path: str, revision: str) -> Conversation: + return get_conv_template("gemma") + +class H4DeepSeekAdapter(BaseModelAdapter): + """The model adapter for H4 DeepSeek models""" + + def match(self, model_path: str): + return "deepseek" in model_path.lower() and "deepseek-coder" not in model_path.lower() + + def load_model(self, model_path: str, from_pretrained_kwargs: dict): + model, tokenizer = super().load_model(model_path, from_pretrained_kwargs) + model.config.eos_token_id = tokenizer.eos_token_id + model.config.pad_token_id = tokenizer.pad_token_id + return model, tokenizer + + def get_default_conv_template(self, model_path: str, revision: str) -> Conversation: + tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision) + # Legacy models did not have a chat template, so we default to the H4 template. 
+ if tokenizer.chat_template is None or "<|im_start|>" not in tokenizer.chat_template: + return get_conv_template("h4_default_v3") + else: + return get_conv_template("chatml") + +class H4MixtralAdapter(BaseModelAdapter): + """The model adapter for H4 Mixtral models""" + + def match(self, model_path: str): + return "mixtral" in model_path.lower() + + def load_model(self, model_path: str, from_pretrained_kwargs: dict): + model, tokenizer = super().load_model(model_path, from_pretrained_kwargs) + model.config.eos_token_id = tokenizer.eos_token_id + model.config.pad_token_id = tokenizer.pad_token_id + return model, tokenizer + + def get_default_conv_template(self, model_path: str, revision: str) -> Conversation: + tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision) + # Legacy models did not have a chat template, so we default to the H4 template. + if tokenizer.chat_template is None or "<|im_start|>" not in tokenizer.chat_template: + return get_conv_template("h4_default_v3") + else: + return get_conv_template("chatml") + +class H4PhiAdapter(BaseModelAdapter): + """The model adapter for H4 Phi models""" + + def match(self, model_path: str): + return "phi" in model_path.lower() + + def load_model(self, model_path: str, from_pretrained_kwargs: dict): + model, tokenizer = super().load_model(model_path, from_pretrained_kwargs) + model.config.eos_token_id = tokenizer.eos_token_id + model.config.pad_token_id = tokenizer.pad_token_id + return model, tokenizer + + def get_default_conv_template(self, model_path: str, revision: str) -> Conversation: + tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision) + # Legacy models did not have a chat template, so we default to the H4 template. 
+        if tokenizer.chat_template is None or "<|im_start|>" not in tokenizer.chat_template:
+            return get_conv_template("h4_default_v3")
+        else:
+            return get_conv_template("chatml")
+
+class H4Qwen2Adapter(BaseModelAdapter):
+    """The model adapter for H4 Qwen2 models"""
+
+    def match(self, model_path: str):
+        # NOTE(review): the original checked '"HuggingFaceH4" in model_path.lower()',
+        # which can never be true (a lowercased string contains no uppercase letters),
+        # so this adapter never matched. Compare against the lowercase form instead.
+        return "qwen" in model_path.lower() and "huggingfaceh4" in model_path.lower()
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+
+    def get_default_conv_template(self, model_path: str, revision: str) -> Conversation:
+        tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision)
+        # Legacy models did not have a chat template, so we default to the H4 template.
+        if tokenizer.chat_template is None or "<|im_start|>" not in tokenizer.chat_template:
+            return get_conv_template("h4_default_v3")
+        else:
+            return get_conv_template("chatml")
+
+class OrpoQwenAdapter(BaseModelAdapter):
+    """The model adapter for Orpo Qwen2 models"""
+
+    def match(self, model_path: str):
+        return "qwen" in model_path.lower() and "orpo-explorers" in model_path.lower()
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+
+    def get_default_conv_template(self, model_path: str, revision: str) -> Conversation:
+        return get_conv_template("orpo-qwen")
+
+class OrpoLlamaAdapter(BaseModelAdapter):
+    """The model adapter for Orpo Llama models"""
+
+    def match(self, model_path: str):
+        return "llama" in model_path.lower() and "orpo-explorers" in model_path.lower()
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        
model, tokenizer = super().load_model(model_path, from_pretrained_kwargs) + model.config.eos_token_id = tokenizer.eos_token_id + model.config.pad_token_id = tokenizer.pad_token_id + return model, tokenizer + + def get_default_conv_template(self, model_path: str, revision: str) -> Conversation: + tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision) + # Legacy models did not have a chat template, so we default to the H4 template. + if tokenizer.chat_template is None or "<|im_start|>" not in tokenizer.chat_template: + return get_conv_template("h4_default_v3") + else: + return get_conv_template("chatml") class CuteGPTAdapter(BaseModelAdapter): """The model adapter for llama-2""" @@ -1575,9 +1817,66 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): model.config.pad_token_id = tokenizer.pad_token_id return model, tokenizer - def get_default_conv_template(self, model_path: str) -> Conversation: + def get_default_conv_template(self, model_path: str, revision: str) -> Conversation: return get_conv_template("llama-2") +class Hermes2Adapter(BaseModelAdapter): + """Model adapter for teknium/OpenHermes-2.5-Mistral-7B and teknium/OpenHermes-2-Mistral-7B models""" + + use_fast_tokenizer = False + + def match(self, model_path: str): + return any( + model_str in model_path.lower() + for model_str in ["openhermes-2.5-mistral-7b", "openhermes-2-mistral-7b", "hermes-2-pro-mistral-7b"] + ) + + def load_model(self, model_path: str, from_pretrained_kwargs: dict): + revision = from_pretrained_kwargs.get("revision", "main") + tokenizer = AutoTokenizer.from_pretrained( + model_path, use_fast=self.use_fast_tokenizer, revision=revision + ) + model = AutoModelForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=True, + **from_pretrained_kwargs, + ).eval() + return model, tokenizer + + def get_default_conv_template(self, model_path: str, revision: str) -> Conversation: + return get_conv_template("OpenHermes-2.5-Mistral-7B") + +class 
DeepseekCoderAdapter(BaseModelAdapter):
+    """The model adapter for deepseek-ai's coder models"""
+
+    def match(self, model_path: str):
+        return "deepseek-coder" in model_path.lower()
+
+    def get_default_conv_template(self, model_path: str, revision: str) -> Conversation:
+        return get_conv_template("deepseek-coder")
+
+class DBRXAdapter(BaseModelAdapter):
+    """The model adapter for DBRX models"""
+
+    def match(self, model_path: str):
+        model_path = model_path.lower()
+        # NOTE(review): the original 'not "HuggingFaceH4" in model_path' was always
+        # True after lowercasing (uppercase can never appear in a lowercased string),
+        # making the exclusion dead code; compare against the lowercase form.
+        return "dbrx" in model_path and "huggingfaceh4" not in model_path
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            **from_pretrained_kwargs,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+
+    def get_default_conv_template(self, model_path: str, revision: str) -> Conversation:
+        return get_conv_template("dbrx")

 # Note: the registration order matters.
 # The one registered earlier has a higher matching priority.
@@ -1622,11 +1921,12 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(PythiaAdapter) register_model_adapter(InternLMChatAdapter) register_model_adapter(StarChatAdapter) +register_model_adapter(StarChat2Adapter) register_model_adapter(Llama2Adapter) register_model_adapter(CuteGPTAdapter) register_model_adapter(OpenOrcaAdapter) register_model_adapter(WizardCoderAdapter) -register_model_adapter(QwenChatAdapter) +# register_model_adapter(QwenChatAdapter) register_model_adapter(AquilaChatAdapter) register_model_adapter(BGEAdapter) register_model_adapter(E5Adapter) @@ -1636,6 +1936,18 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(OpenLLaMaOpenInstructAdapter) register_model_adapter(ReaLMAdapter) register_model_adapter(CodeLlamaAdapter) +register_model_adapter(Hermes2Adapter) +register_model_adapter(MistralAdapter) +register_model_adapter(H4DeepSeekAdapter) +register_model_adapter(H4MixtralAdapter) +register_model_adapter(H4PhiAdapter) +register_model_adapter(ZephyrAdapter) +register_model_adapter(H4Qwen2Adapter) +register_model_adapter(H4GemmaAdapter) +register_model_adapter(DeepseekCoderAdapter) +register_model_adapter(DBRXAdapter) +register_model_adapter(OrpoQwenAdapter) +register_model_adapter(OrpoLlamaAdapter) # After all adapters, try the default base adapter. register_model_adapter(BaseModelAdapter) diff --git a/pyproject.toml b/pyproject.toml index 6c1d12f5e..f29386c44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" [project] -name = "fschat" +name = "fastchat" version = "0.2.26" description = "An open platform for training, serving, and evaluating large language model based chatbots." 
readme = "README.md" @@ -14,7 +14,7 @@ classifiers = [ ] dependencies = [ "aiohttp", "fastapi", "httpx", "markdown2[all]", "nh3", "numpy", - "prompt_toolkit>=3.0.0", "pydantic<2,>=1", "requests", "rich>=10.0.0", + "prompt_toolkit>=3.0.0", "pydantic<3,>=1", "requests", "rich>=10.0.0", "shortuuid", "tiktoken", "uvicorn", ]