Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,6 @@ tests/state_of_the_union.txt

# Build
build

# Data
fastchat/llm_judge/data/
26 changes: 26 additions & 0 deletions fastchat/llm_judge/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,32 @@ The judgments will be saved to `data/mt_bench/model_judgment/gpt-4_single.jsonl`

---

#### Run all steps together
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note for internal use - not to be upstreamed


We provide a script that runs all steps together as follows:

```shell
./run.sh {HUB_MODEL_ID} {MT_BENCH_ID} {HUB_MODEL_REVISION} {DTYPE}
```

For example, to evaluate `zephyr-7b-beta` you can run:

```shell
./run.sh HuggingFaceH4/zephyr-7b-beta zephyr-7b-beta
```

To evaluate a specific revision, run:

```shell
./run.sh HuggingFaceH4/mistral-7b-dpo mistral-7b-dpo_v8.2 v8.2
```

To evaluate a specific revision and dtype (`float16` is the default and recommended for most models), run:

```shell
./run.sh HuggingFaceH4/mistral-7b-dpo mistral-7b-dpo_v8.2 v8.2 bfloat16
```

### Other grading options
Besides score-based single-answer grading, we also support two additional grading options based on win rates:
- `pairwise-baseline`: run pairwise comparison against a baseline model.
Expand Down
1 change: 0 additions & 1 deletion fastchat/llm_judge/gen_judgment.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,6 @@ def make_judge_single(judge_model, judge_prompts):
# Show match stats and prompt enter to continue
print("Stats:")
print(json.dumps(match_stat, indent=4))
input("Press Enter to confirm...")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed so we can run evals without confirmation


# Play matches
if args.parallel == 1:
Expand Down
33 changes: 30 additions & 3 deletions fastchat/llm_judge/gen_model_answer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def run_eval(
num_gpus_total,
max_gpu_memory,
dtype,
revision,
base_model_revision,
):
questions = load_questions(question_file, question_begin, question_end)
# random shuffle the questions to balance the loading
Expand Down Expand Up @@ -61,6 +63,8 @@ def run_eval(
num_gpus_per_model,
max_gpu_memory,
dtype=dtype,
revision=revision,
base_model_revision=base_model_revision,
)
)

Expand All @@ -79,9 +83,13 @@ def get_model_answers(
num_gpus_per_model,
max_gpu_memory,
dtype,
revision,
base_model_revision,
):
model, tokenizer = load_model(
model_path,
revision=revision,
base_model_revision=base_model_revision,
device="cuda",
num_gpus=num_gpus_per_model,
max_gpu_memory=max_gpu_memory,
Expand All @@ -100,7 +108,8 @@ def get_model_answers(
choices = []
for i in range(num_choices):
torch.manual_seed(i)
conv = get_conversation_template(model_id)
conv = get_conversation_template(model_path)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is needed because the model_id is often not identical to the model path and it conflicts with is_adapter_model() which tries to look up the string on the Hub

print(f"Using chat template `{conv.name}` to generate answers")
turns = []
for j in range(len(question["turns"])):
qs = question["turns"][j]
Expand All @@ -117,7 +126,7 @@ def get_model_answers(
# some models may error out when generating long outputs
try:
output_ids = model.generate(
torch.as_tensor(input_ids).cuda(),
inputs=torch.as_tensor(input_ids).cuda(),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needed for PeftForCausalLM

do_sample=do_sample,
temperature=temperature,
max_new_tokens=max_new_token,
Expand Down Expand Up @@ -225,6 +234,7 @@ def reorg_answer_file(answer_file):
parser.add_argument(
"--question-end", type=int, help="A debug option. The end index of questions."
)
parser.add_argument("--question-file", type=str, help="The input question file.")
parser.add_argument("--answer-file", type=str, help="The output answer file.")
parser.add_argument(
"--max-new-token",
Expand Down Expand Up @@ -259,6 +269,18 @@ def reorg_answer_file(answer_file):
help="Override the default dtype. If not set, it will use float16 on GPU and float32 on CPU.",
default=None,
)
parser.add_argument(
"--revision",
type=str,
default="main",
help="The revision of the model on the Hugging Face Hub.",
)
parser.add_argument(
"--base-model-revision",
type=str,
default="main",
help="The revision of the base model for PEFT adapters.",
)

args = parser.parse_args()

Expand All @@ -267,7 +289,10 @@ def reorg_answer_file(answer_file):

ray.init()

question_file = f"data/{args.bench_name}/question.jsonl"
if args.question_file:
question_file = args.question_file
else:
question_file = f"data/{args.bench_name}/question.jsonl"
if args.answer_file:
answer_file = args.answer_file
else:
Expand All @@ -288,6 +313,8 @@ def reorg_answer_file(answer_file):
num_gpus_total=args.num_gpus_total,
max_gpu_memory=args.max_gpu_memory,
dtype=str_to_torch_dtype(args.dtype),
revision=args.revision,
base_model_revision=args.base_model_revision,
)

reorg_answer_file(answer_file)
17 changes: 17 additions & 0 deletions fastchat/llm_judge/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Run the full MT-Bench pipeline: generate model answers, judge them, and
# print the aggregated results.
#
# Usage: ./run.sh HUB_MODEL_ID MT_BENCH_ID [REVISION] [DTYPE]
#   HUB_MODEL_ID  Hugging Face Hub model id (e.g. HuggingFaceH4/zephyr-7b-beta)
#   MT_BENCH_ID   identifier used to name the answer/judgment files
#   REVISION      model revision on the Hub (default: main)
#   DTYPE         torch dtype to load the model with (default: float16)

set -x -e

# Fail fast with a usage message instead of passing empty args to Python.
if [ -z "$1" ] || [ -z "$2" ]; then
    echo "Usage: $0 HUB_MODEL_ID MT_BENCH_ID [REVISION] [DTYPE]" >&2
    exit 1
fi

HUB_MODEL_ID=$1
MT_BENCH_ID=$2
REVISION="${3:-main}"
DTYPE="${4:-float16}"

# Generate answers (quoted expansions so ids containing spaces don't word-split)
python gen_model_answer.py --model-path "$HUB_MODEL_ID" --revision "$REVISION" --dtype "$DTYPE" --model-id "$MT_BENCH_ID"

# Judge!
python gen_judgment.py --model-list "$MT_BENCH_ID"

# Get results
python show_result.py
66 changes: 56 additions & 10 deletions fastchat/model/model_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
from fastchat.modules.exllama import ExllamaConfig, load_exllama_model
from fastchat.modules.xfastertransformer import load_xft_model, XftConfig
from fastchat.modules.gptq import GptqConfig, load_gptq_quantized
from fastchat.utils import get_gpu_memory
from fastchat.utils import get_gpu_memory, is_adapter_model

# Check an environment variable to check if we should be sharing Peft model
# weights. When false we treat all Peft models as separate.
Expand All @@ -56,6 +56,13 @@
"claude-instant-1",
)

# OpenAI API models used as judges (no local model adapter applies).
OPENAI_MODEL_LIST = (
"gpt-4",
"gpt-3.5-turbo",
)

# All API-only judge models; get_model_adapter skips adapter/PEFT detection
# for these since they are not Hub repos or local checkpoints.
JUDGE_MODEL_LIST = ANTHROPIC_MODEL_LIST + OPENAI_MODEL_LIST


class BaseModelAdapter:
"""The base and the default model adapter."""
Expand Down Expand Up @@ -118,8 +125,16 @@ def register_model_adapter(cls):


@cache
def get_model_adapter(model_path: str) -> BaseModelAdapter:
def get_model_adapter(model_path: str, revision: str = "main") -> BaseModelAdapter:
"""Get a model adapter for a model_path."""

# Exclude judge LLMs from the model adapter list
if (
model_path not in JUDGE_MODEL_LIST
and is_adapter_model(model_path, revision=revision) is True
):
return PeftModelAdapter()

model_path_basename = os.path.basename(os.path.normpath(model_path))

# Try the basename of model_path at first
Expand Down Expand Up @@ -174,11 +189,15 @@ def load_model(
exllama_config: Optional[ExllamaConfig] = None,
xft_config: Optional[XftConfig] = None,
revision: str = "main",
base_model_revision: str = "main",
debug: bool = False,
):
"""Load a model from Hugging Face."""
# get model adapter
adapter = get_model_adapter(model_path)
adapter = get_model_adapter(model_path, revision=revision)
print(
f"Using model adapter: {adapter.__class__.__name__} for {model_path=} and {revision=}"
)

# Handle device mapping
cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration(
Expand Down Expand Up @@ -305,6 +324,9 @@ def load_model(
return model, tokenizer
kwargs["revision"] = revision

if is_adapter_model(model_path, revision=revision) is True:
kwargs["base_model_revision"] = base_model_revision

if dtype is not None: # Overwrite dtype if it is provided in the arguments.
kwargs["torch_dtype"] = dtype

Expand Down Expand Up @@ -541,15 +563,16 @@ class PeftModelAdapter:

def match(self, model_path: str):
    """Match PEFT checkpoints: adapter weight files present, or "peft" in the path."""
    has_adapter_files = is_adapter_model(model_path)
    return has_adapter_files or "peft" in model_path.lower()

def load_model(self, model_path: str, from_pretrained_kwargs: dict):
"""Loads the base model then the (peft) adapter weights"""
from peft import PeftConfig, PeftModel

config = PeftConfig.from_pretrained(model_path)
revision = from_pretrained_kwargs.get("revision", "main")
config = PeftConfig.from_pretrained(model_path, revision=revision)
base_model_path = config.base_model_name_or_path
if "peft" in base_model_path:
raise ValueError(
Expand Down Expand Up @@ -581,17 +604,23 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict):
# Super important: make sure we use model_path as the
# `adapter_name`.
model = PeftModel.from_pretrained(
base_model, model_path, adapter_name=model_path
base_model, model_path, adapter_name=model_path, revision=revision
)
peft_model_cache[base_model_path] = (model, tokenizer)
return model, tokenizer

# In the normal case, load up the base model weights again.
base_adapter = get_model_adapter(base_model_path)

# h4: we override the `revision` arg to point to the revision of the base model instead of the adapter one.
from_pretrained_kwargs["revision"] = from_pretrained_kwargs.get(
"base_model_revision", "main"
)
from_pretrained_kwargs.pop("base_model_revision", None)
base_model, tokenizer = base_adapter.load_model(
base_model_path, from_pretrained_kwargs
)
model = PeftModel.from_pretrained(base_model, model_path)
model = PeftModel.from_pretrained(base_model, model_path, revision=revision)
return model, tokenizer

def get_default_conv_template(self, model_path: str) -> Conversation:
Expand Down Expand Up @@ -1481,9 +1510,9 @@ class Hermes2Adapter(BaseModelAdapter):
use_fast_tokenizer = False

def match(self, model_path: str):
return (
"openhermes-2.5-mistral-7b"
or "openhermes-2-mistral-7b" in model_path.lower()
return any(
model_str in model_path.lower()
for model_str in ["openhermes-2.5-mistral-7b", "openhermes-2-mistral-7b"]
)

def load_model(self, model_path: str, from_pretrained_kwargs: dict):
Expand Down Expand Up @@ -1922,6 +1951,22 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("Yi-34b-chat")


#############
# H4 Adapters
#############
class H4MistralAdapter(BaseModelAdapter):
    """Adapter for H4 Mistral-based models (e.g. Zephyr); forces the zephyr template."""

    def match(self, model_path: str):
        """Claim any model path containing "mistral" (case-insensitive)."""
        path_lower = model_path.lower()
        return "mistral" in path_lower

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """All H4 Mistral models chat with the zephyr conversation template."""
        return get_conv_template("zephyr")


# Registered before the defaults below so it takes matching priority
register_model_adapter(H4MistralAdapter)

# Note: the registration order matters.
# The one registered earlier has a higher matching priority.
register_model_adapter(PeftModelAdapter)
Expand Down Expand Up @@ -1994,5 +2039,6 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
register_model_adapter(MicrosoftOrcaAdapter)
register_model_adapter(YiAdapter)


# After all adapters, try the default base adapter.
register_model_adapter(BaseModelAdapter)
14 changes: 14 additions & 0 deletions fastchat/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import sys
from typing import AsyncGenerator, Generator
import warnings
from huggingface_hub import list_repo_files
from huggingface_hub.utils._validators import HFValidationError

import requests

Expand Down Expand Up @@ -347,3 +349,15 @@ def str_to_torch_dtype(dtype: str):
return torch.bfloat16
else:
raise ValueError(f"Unrecognized dtype: {dtype}")


def is_adapter_model(model_name_or_path: str, revision: str = "main") -> bool:
    """Return True if the given repo/directory holds PEFT adapter weights.

    First treats the argument as a Hugging Face Hub repo id and lists its
    files at `revision`; if the id is not Hub-valid, falls back to listing a
    local directory. Detection is by presence of the adapter weight file.
    """
    try:
        # Hub repo id: enumerate the remote repository's files.
        files = list_repo_files(model_name_or_path, revision=revision)
    except HFValidationError:
        # Not a valid Hub id — treat it as a local checkpoint directory.
        files = os.listdir(model_name_or_path)
    adapter_weights = ("adapter_model.safetensors", "adapter_model.bin")
    return any(name in files for name in adapter_weights)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ dependencies = [
model_worker = ["accelerate>=0.21", "peft", "sentencepiece", "torch", "transformers>=4.31.0", "protobuf"]
webui = ["gradio"]
train = ["einops", "flash-attn>=2.0", "wandb"]
llm_judge = ["openai<1", "anthropic>=0.3", "ray"]
llm_judge = ["openai<=0.28.0", "anthropic>=0.3", "ray", "pandas"]
dev = ["black==23.3.0", "pylint==2.8.2"]

[project.urls]
Expand Down