Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,6 @@ tests/state_of_the_union.txt

# Build
build

# Data
fastchat/llm_judge/data/
26 changes: 26 additions & 0 deletions fastchat/llm_judge/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,32 @@ The judgments will be saved to `data/mt_bench/model_judgment/gpt-4_single.jsonl`

---

#### Run all steps together
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note for internal use - not to be upstreamed


We provide a script that runs all steps together as follows:

```shell
./run.sh {HUB_MODEL_ID} {MT_BENCH_ID} {HUB_MODEL_REVISION} {DTYPE}
```

For example, to evaluate `zephyr-7b-beta` you can run:

```shell
./run.sh HuggingFaceH4/zephyr-7b-beta zephyr-7b-beta
```

To evaluate a specific revision, run:

```shell
./run.sh HuggingFaceH4/mistral-7b-dpo mistral-7b-dpo_v8.2 v8.2
```

To evaluate a specific revision and dtype (`float16` is the default and recommended for most models), run:

```shell
./run.sh HuggingFaceH4/mistral-7b-dpo mistral-7b-dpo_v8.2 v8.2 bfloat16
```

### Other grading options
Besides score-based single-answer grading, we also support two additional grading options based on win rates:
- `pairwise-baseline`: run pairwise comparison against a baseline model.
Expand Down
1 change: 0 additions & 1 deletion fastchat/llm_judge/gen_judgment.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,6 @@ def make_judge_single(judge_model, judge_prompts):
# Show match stats and prompt enter to continue
print("Stats:")
print(json.dumps(match_stat, indent=4))
input("Press Enter to confirm...")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed so we can run evals without confirmation


# Play matches
if args.parallel == 1:
Expand Down
33 changes: 30 additions & 3 deletions fastchat/llm_judge/gen_model_answer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def run_eval(
num_gpus_total,
max_gpu_memory,
dtype,
revision,
base_model_revision,
):
questions = load_questions(question_file, question_begin, question_end)
# random shuffle the questions to balance the loading
Expand Down Expand Up @@ -61,6 +63,8 @@ def run_eval(
num_gpus_per_model,
max_gpu_memory,
dtype=dtype,
revision=revision,
base_model_revision=base_model_revision,
)
)

Expand All @@ -79,9 +83,13 @@ def get_model_answers(
num_gpus_per_model,
max_gpu_memory,
dtype,
revision,
base_model_revision,
):
model, tokenizer = load_model(
model_path,
revision=revision,
base_model_revision=base_model_revision,
device="cuda",
num_gpus=num_gpus_per_model,
max_gpu_memory=max_gpu_memory,
Expand All @@ -100,7 +108,8 @@ def get_model_answers(
choices = []
for i in range(num_choices):
torch.manual_seed(i)
conv = get_conversation_template(model_id)
conv = get_conversation_template(model_path)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is needed because the model_id is often not identical to the model path and it conflicts with is_adapter_model() which tries to look up the string on the Hub

print(f"Using chat template `{conv.name}` to generate answers")
turns = []
for j in range(len(question["turns"])):
qs = question["turns"][j]
Expand All @@ -117,7 +126,7 @@ def get_model_answers(
# some models may error out when generating long outputs
try:
output_ids = model.generate(
torch.as_tensor(input_ids).cuda(),
inputs=torch.as_tensor(input_ids).cuda(),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needed for PeftForCausalLM

do_sample=do_sample,
temperature=temperature,
max_new_tokens=max_new_token,
Expand Down Expand Up @@ -225,6 +234,7 @@ def reorg_answer_file(answer_file):
parser.add_argument(
"--question-end", type=int, help="A debug option. The end index of questions."
)
parser.add_argument("--question-file", type=str, help="The input question file.")
parser.add_argument("--answer-file", type=str, help="The output answer file.")
parser.add_argument(
"--max-new-token",
Expand Down Expand Up @@ -259,6 +269,18 @@ def reorg_answer_file(answer_file):
help="Override the default dtype. If not set, it will use float16 on GPU and float32 on CPU.",
default=None,
)
parser.add_argument(
"--revision",
type=str,
default="main",
help="The revision of the model on the Hugging Face Hub.",
)
parser.add_argument(
"--base-model-revision",
type=str,
default="main",
help="The revision of the base model for PEFT adapters.",
)

args = parser.parse_args()

Expand All @@ -267,7 +289,10 @@ def reorg_answer_file(answer_file):

ray.init()

question_file = f"data/{args.bench_name}/question.jsonl"
if args.question_file:
question_file = args.question_file
else:
question_file = f"data/{args.bench_name}/question.jsonl"
if args.answer_file:
answer_file = args.answer_file
else:
Expand All @@ -288,6 +313,8 @@ def reorg_answer_file(answer_file):
num_gpus_total=args.num_gpus_total,
max_gpu_memory=args.max_gpu_memory,
dtype=str_to_torch_dtype(args.dtype),
revision=args.revision,
base_model_revision=args.base_model_revision,
)

reorg_answer_file(answer_file)
17 changes: 17 additions & 0 deletions fastchat/llm_judge/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Run the full MT-Bench pipeline: generate model answers, judge them, and
# print the aggregated results.
#
# Usage: ./run.sh HUB_MODEL_ID MT_BENCH_ID [REVISION] [DTYPE]
#   HUB_MODEL_ID  Hugging Face Hub model id (e.g. HuggingFaceH4/zephyr-7b-beta)
#   MT_BENCH_ID   identifier used to name the answer/judgment files
#   REVISION      model revision on the Hub (default: main)
#   DTYPE         torch dtype to load the model with (default: float16)

set -x -e

# Fail fast with a usage message instead of passing empty args to Python.
if [ -z "$1" ] || [ -z "$2" ]; then
    echo "Usage: $0 HUB_MODEL_ID MT_BENCH_ID [REVISION] [DTYPE]" >&2
    exit 1
fi

HUB_MODEL_ID=$1
MT_BENCH_ID=$2
REVISION="${3:-main}"
DTYPE="${4:-float16}"

# Generate answers (quoted expansions so ids containing spaces don't word-split)
python gen_model_answer.py --model-path "$HUB_MODEL_ID" --revision "$REVISION" --dtype "$DTYPE" --model-id "$MT_BENCH_ID"

# Judge!
python gen_judgment.py --model-list "$MT_BENCH_ID"

# Get results
python show_result.py
66 changes: 56 additions & 10 deletions fastchat/model/model_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
from fastchat.modules.exllama import ExllamaConfig, load_exllama_model
from fastchat.modules.xfastertransformer import load_xft_model, XftConfig
from fastchat.modules.gptq import GptqConfig, load_gptq_quantized
from fastchat.utils import get_gpu_memory
from fastchat.utils import get_gpu_memory, is_adapter_model

# Check an environment variable to check if we should be sharing Peft model
# weights. When false we treat all Peft models as separate.
Expand All @@ -56,6 +56,13 @@
"claude-instant-1",
)

# OpenAI API models used as judges (no local model adapter applies).
OPENAI_MODEL_LIST = (
"gpt-4",
"gpt-3.5-turbo",
)

# All API-only judge models; get_model_adapter skips adapter/PEFT detection
# for these since they are not Hub repos or local checkpoints.
JUDGE_MODEL_LIST = ANTHROPIC_MODEL_LIST + OPENAI_MODEL_LIST


class BaseModelAdapter:
"""The base and the default model adapter."""
Expand Down Expand Up @@ -118,8 +125,16 @@ def register_model_adapter(cls):


@cache
def get_model_adapter(model_path: str) -> BaseModelAdapter:
def get_model_adapter(model_path: str, revision: str = "main") -> BaseModelAdapter:
"""Get a model adapter for a model_path."""

# Exclude judge LLMs from the model adapter list
if (
model_path not in JUDGE_MODEL_LIST
and is_adapter_model(model_path, revision=revision) is True
):
return PeftModelAdapter()

model_path_basename = os.path.basename(os.path.normpath(model_path))

# Try the basename of model_path at first
Expand Down Expand Up @@ -174,11 +189,15 @@ def load_model(
exllama_config: Optional[ExllamaConfig] = None,
xft_config: Optional[XftConfig] = None,
revision: str = "main",
base_model_revision: str = "main",
debug: bool = False,
):
"""Load a model from Hugging Face."""
# get model adapter
adapter = get_model_adapter(model_path)
adapter = get_model_adapter(model_path, revision=revision)
print(
f"Using model adapter: {adapter.__class__.__name__} for {model_path=} and {revision=}"
)

# Handle device mapping
cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration(
Expand Down Expand Up @@ -305,6 +324,9 @@ def load_model(
return model, tokenizer
kwargs["revision"] = revision

if is_adapter_model(model_path, revision=revision) is True:
kwargs["base_model_revision"] = base_model_revision

if dtype is not None: # Overwrite dtype if it is provided in the arguments.
kwargs["torch_dtype"] = dtype

Expand Down Expand Up @@ -541,15 +563,16 @@ class PeftModelAdapter:

def match(self, model_path: str):
    """Match PEFT checkpoints: adapter weight files present, or "peft" in the path."""
    has_adapter_files = is_adapter_model(model_path)
    return has_adapter_files or "peft" in model_path.lower()

def load_model(self, model_path: str, from_pretrained_kwargs: dict):
"""Loads the base model then the (peft) adapter weights"""
from peft import PeftConfig, PeftModel

config = PeftConfig.from_pretrained(model_path)
revision = from_pretrained_kwargs.get("revision", "main")
config = PeftConfig.from_pretrained(model_path, revision=revision)
base_model_path = config.base_model_name_or_path
if "peft" in base_model_path:
raise ValueError(
Expand Down Expand Up @@ -581,17 +604,23 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict):
# Super important: make sure we use model_path as the
# `adapter_name`.
model = PeftModel.from_pretrained(
base_model, model_path, adapter_name=model_path
base_model, model_path, adapter_name=model_path, revision=revision
)
peft_model_cache[base_model_path] = (model, tokenizer)
return model, tokenizer

# In the normal case, load up the base model weights again.
base_adapter = get_model_adapter(base_model_path)

# h4: we override the `revision` arg to point to the revision of the base model instead of the adapter one.
from_pretrained_kwargs["revision"] = from_pretrained_kwargs.get(
"base_model_revision", "main"
)
from_pretrained_kwargs.pop("base_model_revision", None)
base_model, tokenizer = base_adapter.load_model(
base_model_path, from_pretrained_kwargs
)
model = PeftModel.from_pretrained(base_model, model_path)
model = PeftModel.from_pretrained(base_model, model_path, revision=revision)
return model, tokenizer

def get_default_conv_template(self, model_path: str) -> Conversation:
Expand Down Expand Up @@ -1481,9 +1510,9 @@ class Hermes2Adapter(BaseModelAdapter):
use_fast_tokenizer = False

def match(self, model_path: str):
return (
"openhermes-2.5-mistral-7b"
or "openhermes-2-mistral-7b" in model_path.lower()
return any(
model_str in model_path.lower()
for model_str in ["openhermes-2.5-mistral-7b", "openhermes-2-mistral-7b"]
)

def load_model(self, model_path: str, from_pretrained_kwargs: dict):
Expand Down Expand Up @@ -1922,6 +1951,22 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
return get_conv_template("Yi-34b-chat")


#############
# H4 Adapters
#############
class H4MistralAdapter(BaseModelAdapter):
    """Adapter for H4 Mistral-based models (e.g. Zephyr); forces the zephyr template."""

    def match(self, model_path: str):
        """Claim any model path containing "mistral" (case-insensitive)."""
        path_lower = model_path.lower()
        return "mistral" in path_lower

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """All H4 Mistral models chat with the zephyr conversation template."""
        return get_conv_template("zephyr")


# Registered before the defaults below so it takes matching priority
register_model_adapter(H4MistralAdapter)

# Note: the registration order matters.
# The one registered earlier has a higher matching priority.
register_model_adapter(PeftModelAdapter)
Expand Down Expand Up @@ -1994,5 +2039,6 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
register_model_adapter(MicrosoftOrcaAdapter)
register_model_adapter(YiAdapter)


# After all adapters, try the default base adapter.
register_model_adapter(BaseModelAdapter)
14 changes: 14 additions & 0 deletions fastchat/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import sys
from typing import AsyncGenerator, Generator
import warnings
from huggingface_hub import list_repo_files
from huggingface_hub.utils._validators import HFValidationError

import requests

Expand Down Expand Up @@ -347,3 +349,15 @@ def str_to_torch_dtype(dtype: str):
return torch.bfloat16
else:
raise ValueError(f"Unrecognized dtype: {dtype}")


def is_adapter_model(model_name_or_path: str, revision: str = "main") -> bool:
    """Return True if the given repo/directory holds PEFT adapter weights.

    First treats the argument as a Hugging Face Hub repo id and lists its
    files at `revision`; if the id is not Hub-valid, falls back to listing a
    local directory. Detection is by presence of the adapter weight file.
    """
    try:
        # Hub repo id: enumerate the remote repository's files.
        files = list_repo_files(model_name_or_path, revision=revision)
    except HFValidationError:
        # Not a valid Hub id — treat it as a local checkpoint directory.
        files = os.listdir(model_name_or_path)
    adapter_weights = ("adapter_model.safetensors", "adapter_model.bin")
    return any(name in files for name in adapter_weights)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ dependencies = [
model_worker = ["accelerate>=0.21", "peft", "sentencepiece", "torch", "transformers>=4.31.0", "protobuf"]
webui = ["gradio"]
train = ["einops", "flash-attn>=2.0", "wandb"]
llm_judge = ["openai<1", "anthropic>=0.3", "ray"]
llm_judge = ["openai<=0.28.0", "anthropic>=0.3", "ray", "pandas"]
dev = ["black==23.3.0", "pylint==2.8.2"]

[project.urls]
Expand Down