diff --git a/.gitignore b/.gitignore
index 94b6e614d..82276c686 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,3 +29,6 @@ tests/state_of_the_union.txt
 
 # Build
 build
+
+# Data
+fastchat/llm_judge/data/
\ No newline at end of file
diff --git a/fastchat/llm_judge/README.md b/fastchat/llm_judge/README.md
index 1d2646b13..990b1ce66 100644
--- a/fastchat/llm_judge/README.md
+++ b/fastchat/llm_judge/README.md
@@ -87,6 +87,32 @@ The judgments will be saved to `data/mt_bench/model_judgment/gpt-4_single.jsonl`
 
 ---
 
+#### Run all steps together
+
+We provide a script that runs all steps together. Run it from `fastchat/llm_judge`; the last two arguments are optional (the revision defaults to `main` and the dtype to `float16`):
+
+```shell
+./run.sh {HUB_MODEL_ID} {MT_BENCH_ID} [{HUB_MODEL_REVISION}] [{DTYPE}]
+```
+
+For example, to evaluate `zephyr-7b-beta` you can run:
+
+```shell
+./run.sh HuggingFaceH4/zephyr-7b-beta zephyr-7b-beta
+```
+
+To evaluate a specific revision, run:
+
+```shell
+./run.sh HuggingFaceH4/mistral-7b-dpo mistral-7b-dpo_v8.2 v8.2
+```
+
+To evaluate a specific revision and dtype (`float16` is the default and recommended for most models), run:
+
+```shell
+./run.sh HuggingFaceH4/mistral-7b-dpo mistral-7b-dpo_v8.2 v8.2 bfloat16
+```
+
 ### Other grading options
 Besides score-based single-answer grading, we also support two additional grading options based on win rates:
 - `pariwise-baseline`: run pairwise comparison against a baseline model.
diff --git a/fastchat/llm_judge/gen_judgment.py b/fastchat/llm_judge/gen_judgment.py index a1c70b295..867a8208e 100644 --- a/fastchat/llm_judge/gen_judgment.py +++ b/fastchat/llm_judge/gen_judgment.py @@ -301,7 +301,6 @@ def make_judge_single(judge_model, judge_prompts): # Show match stats and prompt enter to continue print("Stats:") print(json.dumps(match_stat, indent=4)) - input("Press Enter to confirm...") # Play matches if args.parallel == 1: diff --git a/fastchat/llm_judge/gen_model_answer.py b/fastchat/llm_judge/gen_model_answer.py index be399750f..a4f4bd351 100644 --- a/fastchat/llm_judge/gen_model_answer.py +++ b/fastchat/llm_judge/gen_model_answer.py @@ -31,6 +31,8 @@ def run_eval( num_gpus_total, max_gpu_memory, dtype, + revision, + base_model_revision, ): questions = load_questions(question_file, question_begin, question_end) # random shuffle the questions to balance the loading @@ -61,6 +63,8 @@ def run_eval( num_gpus_per_model, max_gpu_memory, dtype=dtype, + revision=revision, + base_model_revision=base_model_revision, ) ) @@ -79,9 +83,13 @@ def get_model_answers( num_gpus_per_model, max_gpu_memory, dtype, + revision, + base_model_revision, ): model, tokenizer = load_model( model_path, + revision=revision, + base_model_revision=base_model_revision, device="cuda", num_gpus=num_gpus_per_model, max_gpu_memory=max_gpu_memory, @@ -100,7 +108,8 @@ def get_model_answers( choices = [] for i in range(num_choices): torch.manual_seed(i) - conv = get_conversation_template(model_id) + conv = get_conversation_template(model_path) + print(f"Using chat template `{conv.name}` to generate answers") turns = [] for j in range(len(question["turns"])): qs = question["turns"][j] @@ -117,7 +126,7 @@ def get_model_answers( # some models may error out when generating long outputs try: output_ids = model.generate( - torch.as_tensor(input_ids).cuda(), + inputs=torch.as_tensor(input_ids).cuda(), do_sample=do_sample, temperature=temperature, max_new_tokens=max_new_token, @@ -225,6 
+234,7 @@ def reorg_answer_file(answer_file): parser.add_argument( "--question-end", type=int, help="A debug option. The end index of questions." ) + parser.add_argument("--question-file", type=str, help="The input question file.") parser.add_argument("--answer-file", type=str, help="The output answer file.") parser.add_argument( "--max-new-token", @@ -259,6 +269,18 @@ def reorg_answer_file(answer_file): help="Override the default dtype. If not set, it will use float16 on GPU and float32 on CPU.", default=None, ) + parser.add_argument( + "--revision", + type=str, + default="main", + help="The revision of the model on the Hugging Face Hub.", + ) + parser.add_argument( + "--base-model-revision", + type=str, + default="main", + help="The revision of the base model for PEFT adapters.", + ) args = parser.parse_args() @@ -267,7 +289,10 @@ def reorg_answer_file(answer_file): ray.init() - question_file = f"data/{args.bench_name}/question.jsonl" + if args.question_file: + question_file = args.question_file + else: + question_file = f"data/{args.bench_name}/question.jsonl" if args.answer_file: answer_file = args.answer_file else: @@ -288,6 +313,8 @@ def reorg_answer_file(answer_file): num_gpus_total=args.num_gpus_total, max_gpu_memory=args.max_gpu_memory, dtype=str_to_torch_dtype(args.dtype), + revision=args.revision, + base_model_revision=args.base_model_revision, ) reorg_answer_file(answer_file) diff --git a/fastchat/llm_judge/run.sh b/fastchat/llm_judge/run.sh new file mode 100755 index 000000000..78cc88d1a --- /dev/null +++ b/fastchat/llm_judge/run.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -x -e + +HUB_MODEL_ID=$1 +MT_BENCH_ID=$2 +[ -z "$3" ] && REVISION="main" || REVISION=$3 +[ -z "$4" ] && DTYPE="float16" || DTYPE=$4 + +# Generate answer +python gen_model_answer.py --model-path $HUB_MODEL_ID --revision $REVISION --dtype $DTYPE --model-id $MT_BENCH_ID + +# Judge! 
+python gen_judgment.py --model-list $MT_BENCH_ID + +# Get results +python show_result.py \ No newline at end of file diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index ee4fe573d..05a7ca2d8 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -42,7 +42,7 @@ from fastchat.modules.exllama import ExllamaConfig, load_exllama_model from fastchat.modules.xfastertransformer import load_xft_model, XftConfig from fastchat.modules.gptq import GptqConfig, load_gptq_quantized -from fastchat.utils import get_gpu_memory +from fastchat.utils import get_gpu_memory, is_adapter_model # Check an environment variable to check if we should be sharing Peft model # weights. When false we treat all Peft models as separate. @@ -56,6 +56,13 @@ "claude-instant-1", ) +OPENAI_MODEL_LIST = ( + "gpt-4", + "gpt-3.5-turbo", +) + +JUDGE_MODEL_LIST = ANTHROPIC_MODEL_LIST + OPENAI_MODEL_LIST + class BaseModelAdapter: """The base and the default model adapter.""" @@ -118,8 +125,16 @@ def register_model_adapter(cls): @cache -def get_model_adapter(model_path: str) -> BaseModelAdapter: +def get_model_adapter(model_path: str, revision: str = "main") -> BaseModelAdapter: """Get a model adapter for a model_path.""" + + # Exclude judge LLMs from the model adapter list + if ( + model_path not in JUDGE_MODEL_LIST + and is_adapter_model(model_path, revision=revision) is True + ): + return PeftModelAdapter() + model_path_basename = os.path.basename(os.path.normpath(model_path)) # Try the basename of model_path at first @@ -174,11 +189,15 @@ def load_model( exllama_config: Optional[ExllamaConfig] = None, xft_config: Optional[XftConfig] = None, revision: str = "main", + base_model_revision: str = "main", debug: bool = False, ): """Load a model from Hugging Face.""" # get model adapter - adapter = get_model_adapter(model_path) + adapter = get_model_adapter(model_path, revision=revision) + print( + f"Using model adapter: {adapter.__class__.__name__} 
for {model_path=} and {revision=}" + ) # Handle device mapping cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration( @@ -305,6 +324,9 @@ def load_model( return model, tokenizer kwargs["revision"] = revision + if is_adapter_model(model_path, revision=revision) is True: + kwargs["base_model_revision"] = base_model_revision + if dtype is not None: # Overwrite dtype if it is provided in the arguments. kwargs["torch_dtype"] = dtype @@ -541,7 +563,7 @@ class PeftModelAdapter: def match(self, model_path: str): """Accepts any model path with "peft" in the name""" - if os.path.exists(os.path.join(model_path, "adapter_config.json")): + if is_adapter_model(model_path): return True return "peft" in model_path.lower() @@ -549,7 +571,8 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): """Loads the base model then the (peft) adapter weights""" from peft import PeftConfig, PeftModel - config = PeftConfig.from_pretrained(model_path) + revision = from_pretrained_kwargs.get("revision", "main") + config = PeftConfig.from_pretrained(model_path, revision=revision) base_model_path = config.base_model_name_or_path if "peft" in base_model_path: raise ValueError( @@ -581,17 +604,23 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): # Super important: make sure we use model_path as the # `adapter_name`. model = PeftModel.from_pretrained( - base_model, model_path, adapter_name=model_path + base_model, model_path, adapter_name=model_path, revision=revision ) peft_model_cache[base_model_path] = (model, tokenizer) return model, tokenizer # In the normal case, load up the base model weights again. base_adapter = get_model_adapter(base_model_path) + + # h4: we override the `revision` arg to point to the revision of the base model instead of the adapter one. 
+ from_pretrained_kwargs["revision"] = from_pretrained_kwargs.get( + "base_model_revision", "main" + ) + from_pretrained_kwargs.pop("base_model_revision", None) base_model, tokenizer = base_adapter.load_model( base_model_path, from_pretrained_kwargs ) - model = PeftModel.from_pretrained(base_model, model_path) + model = PeftModel.from_pretrained(base_model, model_path, revision=revision) return model, tokenizer def get_default_conv_template(self, model_path: str) -> Conversation: @@ -1481,9 +1510,9 @@ class Hermes2Adapter(BaseModelAdapter): use_fast_tokenizer = False def match(self, model_path: str): - return ( - "openhermes-2.5-mistral-7b" - or "openhermes-2-mistral-7b" in model_path.lower() + return any( + model_str in model_path.lower() + for model_str in ["openhermes-2.5-mistral-7b", "openhermes-2-mistral-7b"] ) def load_model(self, model_path: str, from_pretrained_kwargs: dict): @@ -1922,6 +1951,22 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("Yi-34b-chat") +############# +# H4 Adapters +############# +class H4MistralAdapter(BaseModelAdapter): + """The model adapter for H4 Mistral models""" + + def match(self, model_path: str): + return "mistral" in model_path.lower() + + def get_default_conv_template(self, model_path: str) -> Conversation: + return get_conv_template("zephyr") + + +# Register our adapters first to prioritise over defaults +register_model_adapter(H4MistralAdapter) + # Note: the registration order matters. # The one registered earlier has a higher matching priority. register_model_adapter(PeftModelAdapter) @@ -1994,5 +2039,6 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(MicrosoftOrcaAdapter) register_model_adapter(YiAdapter) + # After all adapters, try the default base adapter. 
register_model_adapter(BaseModelAdapter)
diff --git a/fastchat/utils.py b/fastchat/utils.py
index b5e3ba543..90a66cf4d 100644
--- a/fastchat/utils.py
+++ b/fastchat/utils.py
@@ -10,6 +10,8 @@
 import sys
 from typing import AsyncGenerator, Generator
 import warnings
+from huggingface_hub import list_repo_files
+from huggingface_hub.utils import HFValidationError, RepositoryNotFoundError
 
 import requests
 
@@ -347,3 +349,32 @@ def str_to_torch_dtype(dtype: str):
         return torch.bfloat16
     else:
         raise ValueError(f"Unrecognized dtype: {dtype}")
+
+
+def is_adapter_model(model_name_or_path: str, revision: str = "main") -> bool:
+    """Return whether a local directory or Hub repo holds PEFT adapter weights.
+
+    A model counts as an adapter when `adapter_model.safetensors` or
+    `adapter_model.bin` is among its top-level files.
+
+    Args:
+        model_name_or_path: A local directory or a Hugging Face Hub repo id.
+        revision: The Hub revision to inspect; ignored for local directories.
+
+    Returns:
+        True if adapter weights are found. False if they are not, or if the
+        name is neither a local directory nor a Hub repo that can be found
+        (e.g. an API-only model name such as "gpt-4").
+    """
+    if os.path.isdir(model_name_or_path):
+        # Local checkpoints are authoritative and need no network round-trip.
+        repo_files = os.listdir(model_name_or_path)
+    else:
+        try:
+            repo_files = list_repo_files(model_name_or_path, revision=revision)
+        except (HFValidationError, RepositoryNotFoundError):
+            # Not a valid repo id, or no such repo on the Hub.
+            return False
+    return (
+        "adapter_model.safetensors" in repo_files or "adapter_model.bin" in repo_files
+    )
diff --git a/pyproject.toml b/pyproject.toml
index b6db03490..ee9ee7404 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
 model_worker = ["accelerate>=0.21", "peft", "sentencepiece", "torch", "transformers>=4.31.0", "protobuf"]
 webui = ["gradio"]
 train = ["einops", "flash-attn>=2.0", "wandb"]
-llm_judge = ["openai<1", "anthropic>=0.3", "ray"]
+llm_judge = ["openai<=0.28.0", "anthropic>=0.3", "ray", "pandas"]
 dev = ["black==23.3.0", "pylint==2.8.2"]
 
 [project.urls]