From 336f9a83f0a7d91f096b5fd513e7e255f179d68b Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 25 Oct 2023 09:28:01 +0000 Subject: [PATCH 01/19] Add H4 adapter --- fastchat/model/model_adapter.py | 35 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index 148ccc0cb..78637b6b6 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -171,6 +171,7 @@ def load_model( """Load a model from Hugging Face.""" # get model adapter adapter = get_model_adapter(model_path) + print(f"Using model adapter: {adapter.__class__.__name__} for model path {model_path} and revision {revision}") # Handle device mapping cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration( @@ -1292,20 +1293,20 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("starchat") -class MistralAdapter(BaseModelAdapter): - """The model adapter for Mistral AI models""" +# class MistralAdapter(BaseModelAdapter): +# """The model adapter for Mistral AI models""" - def match(self, model_path: str): - return "mistral" in model_path.lower() +# def match(self, model_path: str): +# return "mistral" in model_path.lower() - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - model, tokenizer = super().load_model(model_path, from_pretrained_kwargs) - model.config.eos_token_id = tokenizer.eos_token_id - model.config.pad_token_id = tokenizer.pad_token_id - return model, tokenizer +# def load_model(self, model_path: str, from_pretrained_kwargs: dict): +# model, tokenizer = super().load_model(model_path, from_pretrained_kwargs) +# model.config.eos_token_id = tokenizer.eos_token_id +# model.config.pad_token_id = tokenizer.pad_token_id +# return model, tokenizer - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("mistral") +# def get_default_conv_template(self, model_path: str) -> Conversation: +# return get_conv_template("mistral") class Llama2Adapter(BaseModelAdapter): @@ -1680,6 +1681,15 @@ def match(self, model_path: str): def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("zephyr") +class H4MistralAdapter(BaseModelAdapter): + """The model adapter for H4 Mistral models""" + + def match(self, model_path: str): + return "mistral" in model_path.lower() + + def get_default_conv_template(self, model_path: str) -> Conversation: + return get_conv_template("zephyr") + class XwinLMAdapter(BaseModelAdapter): """The model adapter for Xwin-LM V0.1 and V0.2 series of models(e.g., Xwin-LM/Xwin-LM-70B-V0.1)""" @@ -1737,7 +1747,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(InternLMChatAdapter) register_model_adapter(StarChatAdapter) register_model_adapter(Llama2Adapter) -register_model_adapter(MistralAdapter) +# register_model_adapter(MistralAdapter) register_model_adapter(CuteGPTAdapter) register_model_adapter(OpenOrcaAdapter) register_model_adapter(WizardCoderAdapter) @@ -1755,6 +1765,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(Llama2ChangAdapter) register_model_adapter(ZephyrAdapter) register_model_adapter(XwinLMAdapter) +register_model_adapter(H4MistralAdapter) # After all adapters, try the default base adapter. 
register_model_adapter(BaseModelAdapter) From 176d26d12c313a1e0c55ef878639158209e08164 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 23 Nov 2023 09:27:08 +0000 Subject: [PATCH 02/19] Restore Mistral adapter --- .gitignore | 3 +++ fastchat/model/model_adapter.py | 26 +++++++++++++------------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index 94b6e614d..82276c686 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,6 @@ tests/state_of_the_union.txt # Build build + +# Data +fastchat/llm_judge/data/ \ No newline at end of file diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index a7539f134..0f8ab6baf 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -1388,20 +1388,20 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("starchat") -# class MistralAdapter(BaseModelAdapter): -# """The model adapter for Mistral AI models""" +class MistralAdapter(BaseModelAdapter): + """The model adapter for Mistral AI models""" -# def match(self, model_path: str): -# return "mistral" in model_path.lower() + def match(self, model_path: str): + return "mistral" in model_path.lower() -# def load_model(self, model_path: str, from_pretrained_kwargs: dict): -# model, tokenizer = super().load_model(model_path, from_pretrained_kwargs) -# model.config.eos_token_id = tokenizer.eos_token_id -# model.config.pad_token_id = tokenizer.pad_token_id -# return model, tokenizer + def load_model(self, model_path: str, from_pretrained_kwargs: dict): + model, tokenizer = super().load_model(model_path, from_pretrained_kwargs) + model.config.eos_token_id = tokenizer.eos_token_id + model.config.pad_token_id = tokenizer.pad_token_id + return model, tokenizer -# def get_default_conv_template(self, model_path: str) -> Conversation: -# return get_conv_template("mistral") + def get_default_conv_template(self, model_path: str) -> Conversation: + return get_conv_template("mistral") class Llama2Adapter(BaseModelAdapter): @@ -1839,7 +1839,7 @@ class H4MistralAdapter(BaseModelAdapter): """The model adapter for H4 Mistral models""" def match(self, model_path: str): - return "mistral" in model_path.lower() + return "HuggingFaceH4/mistral" in model_path.lower() def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("zephyr") @@ -1953,7 +1953,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(InternLMChatAdapter) register_model_adapter(StarChatAdapter) register_model_adapter(Llama2Adapter) -# register_model_adapter(MistralAdapter) +register_model_adapter(MistralAdapter) register_model_adapter(CuteGPTAdapter) register_model_adapter(OpenOrcaAdapter) register_model_adapter(MistralAdapter) From d0ae0dd1b5faff094f6e5e22bfd49db8be81a46c Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 23 Nov 2023 09:28:12 +0000 Subject: [PATCH 03/19] Fix --- fastchat/model/model_adapter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index 0f8ab6baf..e52ed4017 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -1953,7 +1953,6 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(InternLMChatAdapter) register_model_adapter(StarChatAdapter) register_model_adapter(Llama2Adapter) -register_model_adapter(MistralAdapter) register_model_adapter(CuteGPTAdapter) 
 register_model_adapter(OpenOrcaAdapter)
 register_model_adapter(MistralAdapter)

From 1fd1a98a6269cee35f3f6fb50451476afef6da27 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall
Date: Thu, 23 Nov 2023 09:29:58 +0000
Subject: [PATCH 04/19] Disable confirmation

---
 fastchat/llm_judge/gen_judgment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastchat/llm_judge/gen_judgment.py b/fastchat/llm_judge/gen_judgment.py
index a1c70b295..7b1b18116 100644
--- a/fastchat/llm_judge/gen_judgment.py
+++ b/fastchat/llm_judge/gen_judgment.py
@@ -301,7 +301,7 @@ def make_judge_single(judge_model, judge_prompts):
     # Show match stats and prompt enter to continue
     print("Stats:")
     print(json.dumps(match_stat, indent=4))
-    input("Press Enter to confirm...")
+    # input("Press Enter to confirm...")

     # Play matches
     if args.parallel == 1:

From e15bae6ad062351b3c0fdc6a2f839b13e51f0e97 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall
Date: Thu, 23 Nov 2023 12:10:12 +0000
Subject: [PATCH 05/19] Add table

---
 fastchat/llm_judge/run.sh | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100755 fastchat/llm_judge/run.sh

diff --git a/fastchat/llm_judge/run.sh b/fastchat/llm_judge/run.sh
new file mode 100755
index 000000000..db9938a64
--- /dev/null
+++ b/fastchat/llm_judge/run.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+set -x -e
+HUB_MODEL_ID=$1
+MT_BENCH_ID=$2
+[ -z "$3" ] && DTYPR="float16" || DTYPE=$3
+
+# Generate answer
+python gen_model_answer.py --model-path $HUB_MODEL_ID --model-id $MT_BENCH_ID --dtype $DTYPE
+
+# Judge!
+python gen_judgment.py --model-list $MT_BENCH_ID
+
+# Get results
+python show_result.py
\ No newline at end of file

From ed40de49b84c8750a4449c27e22b3b0df3ae57fc Mon Sep 17 00:00:00 2001
From: Lewis Tunstall
Date: Thu, 23 Nov 2023 13:36:02 +0000
Subject: [PATCH 06/19] Fix script

---
 fastchat/llm_judge/run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastchat/llm_judge/run.sh b/fastchat/llm_judge/run.sh
index db9938a64..f7f6a319b 100755
--- a/fastchat/llm_judge/run.sh
+++ b/fastchat/llm_judge/run.sh
@@ -3,7 +3,7 @@
 set -x -e
 HUB_MODEL_ID=$1
 MT_BENCH_ID=$2
-[ -z "$3" ] && DTYPR="float16" || DTYPE=$3
+[ -z "$3" ] && DTYPE="float16" || DTYPE=$3

 # Generate answer
 python gen_model_answer.py --model-path $HUB_MODEL_ID --model-id $MT_BENCH_ID --dtype $DTYPE

From d7e5995306e253b182fdbcd63041bb7342975255 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall
Date: Thu, 23 Nov 2023 13:56:03 +0000
Subject: [PATCH 07/19] Refactor

---
 fastchat/model/model_adapter.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py
index 11987515b..933dec262 100644
--- a/fastchat/model/model_adapter.py
+++ b/fastchat/model/model_adapter.py
@@ -1862,15 +1862,6 @@ def match(self, model_path: str):
     def get_default_conv_template(self, model_path: str) -> Conversation:
         return get_conv_template("zephyr")

-class H4MistralAdapter(BaseModelAdapter):
-    """The model adapter for H4 Mistral models"""
-
-    def match(self, model_path: str):
-        return "HuggingFaceH4/mistral" in model_path.lower()
-
-    def get_default_conv_template(self, model_path: str) -> Conversation:
-        return get_conv_template("zephyr")
-

 class XwinLMAdapter(BaseModelAdapter):
     """The model adapter for Xwin-LM V0.1 and V0.2 series of models(e.g., Xwin-LM/Xwin-LM-70B-V0.1)"""
@@ -1932,6 +1923,18 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
     return get_conv_template("Yi-34b-chat")


+#############
+# H4 Adapters
+############# +class H4MistralAdapter(BaseModelAdapter): + """The model adapter for H4 Mistral models""" + + def match(self, model_path: str): + return "HuggingFaceH4" in model_path and "mistral" in model_path.lower() + + def get_default_conv_template(self, model_path: str) -> Conversation: + return get_conv_template("zephyr") + # Note: the registration order matters. # The one registered earlier has a higher matching priority. register_model_adapter(PeftModelAdapter) @@ -2000,10 +2003,14 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(ZephyrAdapter) register_model_adapter(XwinLMAdapter) register_model_adapter(LemurAdapter) -register_model_adapter(H4MistralAdapter) register_model_adapter(PygmalionAdapter) register_model_adapter(MicrosoftOrcaAdapter) register_model_adapter(YiAdapter) +############# +# H4 Adapters +############# +register_model_adapter(H4MistralAdapter) + # After all adapters, try the default base adapter. register_model_adapter(BaseModelAdapter) From 04f556f60923ccbcfea2e7359ddd974d4375169a Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 23 Nov 2023 14:04:09 +0000 Subject: [PATCH 08/19] Add model revision --- fastchat/llm_judge/gen_model_answer.py | 9 +++++++++ fastchat/llm_judge/run.sh | 3 ++- fastchat/model/model_adapter.py | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/fastchat/llm_judge/gen_model_answer.py b/fastchat/llm_judge/gen_model_answer.py index be399750f..e659c2fd9 100644 --- a/fastchat/llm_judge/gen_model_answer.py +++ b/fastchat/llm_judge/gen_model_answer.py @@ -31,6 +31,7 @@ def run_eval( num_gpus_total, max_gpu_memory, dtype, + revision, ): questions = load_questions(question_file, question_begin, question_end) # random shuffle the questions to balance the loading @@ -79,9 +80,11 @@ def get_model_answers( num_gpus_per_model, max_gpu_memory, dtype, + revision, ): model, tokenizer = load_model( model_path, + revision=revision, device="cuda", num_gpus=num_gpus_per_model, max_gpu_memory=max_gpu_memory, @@ -259,6 +262,12 @@ def reorg_answer_file(answer_file): help="Override the default dtype. 
If not set, it will use float16 on GPU and float32 on CPU.", default=None, ) + parser.add_argument( + "--model-revision", + type=str, + default="main", + help="The revision of the model on the Hugging Face Hub.", + ) args = parser.parse_args() diff --git a/fastchat/llm_judge/run.sh b/fastchat/llm_judge/run.sh index f7f6a319b..c30a8f52b 100755 --- a/fastchat/llm_judge/run.sh +++ b/fastchat/llm_judge/run.sh @@ -3,7 +3,8 @@ set -x -e HUB_MODEL_ID=$1 MT_BENCH_ID=$2 -[ -z "$3" ] && DTYPE="float16" || DTYPE=$3 +[ -z "$3" ] && REVISION="main" || REVISION=$3 +[ -z "$4" ] && DTYPE="float16" || DTYPE=$4 # Generate answer python gen_model_answer.py --model-path $HUB_MODEL_ID --model-id $MT_BENCH_ID --dtype $DTYPE diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index 933dec262..66ffb26a4 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -179,7 +179,7 @@ def load_model( """Load a model from Hugging Face.""" # get model adapter adapter = get_model_adapter(model_path) - print(f"Using model adapter: {adapter.__class__.__name__} for model path {model_path} and revision {revision}") + print(f"Using model adapter: {adapter.__class__.__name__} for {model_path=} and {revision=}") # Handle device mapping cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration( From 7347d4d5b23f4934f974f4f7d84240be6406fce5 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 23 Nov 2023 14:05:35 +0000 Subject: [PATCH 09/19] Clean --- fastchat/llm_judge/gen_judgment.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastchat/llm_judge/gen_judgment.py b/fastchat/llm_judge/gen_judgment.py index 7b1b18116..867a8208e 100644 --- a/fastchat/llm_judge/gen_judgment.py +++ b/fastchat/llm_judge/gen_judgment.py @@ -301,7 +301,6 @@ def make_judge_single(judge_model, judge_prompts): # Show match stats and prompt enter to continue print("Stats:") print(json.dumps(match_stat, indent=4)) - # input("Press Enter to confirm...") # Play matches if args.parallel == 1: From f3beee413918b8e56d9b322ebefb1ff089e1e78f Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 23 Nov 2023 14:06:55 +0000 Subject: [PATCH 10/19] Add revision to script --- fastchat/llm_judge/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastchat/llm_judge/run.sh b/fastchat/llm_judge/run.sh index c30a8f52b..68faadaed 100755 --- a/fastchat/llm_judge/run.sh +++ b/fastchat/llm_judge/run.sh @@ -7,7 +7,7 @@ MT_BENCH_ID=$2 [ -z "$4" ] && DTYPE="float16" || DTYPE=$4 # Generate answer -python gen_model_answer.py --model-path $HUB_MODEL_ID --model-id $MT_BENCH_ID --dtype $DTYPE +python gen_model_answer.py --model-path $HUB_MODEL_ID --model-revision $REVISION --model-id $MT_BENCH_ID --dtype $DTYPE # Judge! 
python gen_judgment.py --model-list $MT_BENCH_ID From e696df08541079aac108e84c9433fbbc6695c118 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 23 Nov 2023 14:12:46 +0000 Subject: [PATCH 11/19] Pin OAI dep --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b6db03490..68bfe642a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ model_worker = ["accelerate>=0.21", "peft", "sentencepiece", "torch", "transformers>=4.31.0", "protobuf"] webui = ["gradio"] train = ["einops", "flash-attn>=2.0", "wandb"] -llm_judge = ["openai<1", "anthropic>=0.3", "ray"] +llm_judge = ["openai<=0.28.0", "anthropic>=0.3", "ray"] dev = ["black==23.3.0", "pylint==2.8.2"] [project.urls] From 332a49cef9f57850889f5d72cb24cf935d982bac Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 23 Nov 2023 14:17:33 +0000 Subject: [PATCH 12/19] Fix dep --- fastchat/llm_judge/README.md | 14 ++++++++++++++ fastchat/llm_judge/run.sh | 9 +++++---- pyproject.toml | 2 +- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/fastchat/llm_judge/README.md b/fastchat/llm_judge/README.md index 1d2646b13..d8b6caa0d 100644 --- a/fastchat/llm_judge/README.md +++ b/fastchat/llm_judge/README.md @@ -87,6 +87,20 @@ The judgments will be saved to `data/mt_bench/model_judgment/gpt-4_single.jsonl` --- +#### Run all steps together + +We provide a script that runs all steps together as follows: + +```shell +run.sh {HUB_MODEL_ID} {HUB_MODEL_REVISION} {DTYPE} {MT_BENCH_ID} +``` + +For example, to evaluate `zephyr-7b-beta` you can run: + +```shell +run.sh +``` + ### Other grading options Besides score-based single-answer grading, we also support two additional grading options based on win rates: - `pariwise-baseline`: run pairwise comparison against a baseline model. diff --git a/fastchat/llm_judge/run.sh b/fastchat/llm_judge/run.sh index 68faadaed..5eb47dbd2 100755 --- a/fastchat/llm_judge/run.sh +++ b/fastchat/llm_judge/run.sh @@ -1,13 +1,14 @@ #!/bin/bash set -x -e + HUB_MODEL_ID=$1 -MT_BENCH_ID=$2 -[ -z "$3" ] && REVISION="main" || REVISION=$3 -[ -z "$4" ] && DTYPE="float16" || DTYPE=$4 +[ -z "$2" ] && REVISION="main" || REVISION=$2 +[ -z "$3" ] && DTYPE="float16" || DTYPE=$3 +MT_BENCH_ID=$4 # Generate answer -python gen_model_answer.py --model-path $HUB_MODEL_ID --model-revision $REVISION --model-id $MT_BENCH_ID --dtype $DTYPE +python gen_model_answer.py --model-path $HUB_MODEL_ID --model-revision $REVISION --dtype $DTYPE --model-id $MT_BENCH_ID # Judge! 
python gen_judgment.py --model-list $MT_BENCH_ID diff --git a/pyproject.toml b/pyproject.toml index 68bfe642a..ee9ee7404 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ model_worker = ["accelerate>=0.21", "peft", "sentencepiece", "torch", "transformers>=4.31.0", "protobuf"] webui = ["gradio"] train = ["einops", "flash-attn>=2.0", "wandb"] -llm_judge = ["openai<=0.28.0", "anthropic>=0.3", "ray"] +llm_judge = ["openai<=0.28.0", "anthropic>=0.3", "ray", "pandas"] dev = ["black==23.3.0", "pylint==2.8.2"] [project.urls] From de55e69e0d2e92b6ac4cbec3e907718a215afc7f Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 23 Nov 2023 16:43:51 +0000 Subject: [PATCH 13/19] Fix revision --- fastchat/llm_judge/README.md | 2 +- fastchat/llm_judge/gen_model_answer.py | 4 +++- fastchat/llm_judge/run.sh | 8 ++++---- fastchat/model/model_adapter.py | 6 +++--- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fastchat/llm_judge/README.md b/fastchat/llm_judge/README.md index d8b6caa0d..9b7b71cff 100644 --- a/fastchat/llm_judge/README.md +++ b/fastchat/llm_judge/README.md @@ -92,7 +92,7 @@ The judgments will be saved to `data/mt_bench/model_judgment/gpt-4_single.jsonl` We provide a script that runs all steps together as follows: ```shell -run.sh {HUB_MODEL_ID} {HUB_MODEL_REVISION} {DTYPE} {MT_BENCH_ID} +run.sh {HUB_MODEL_ID} {MT_BENCH_ID} {HUB_MODEL_REVISION} {DTYPE} ``` For example, to evaluate `zephyr-7b-beta` you can run: diff --git a/fastchat/llm_judge/gen_model_answer.py b/fastchat/llm_judge/gen_model_answer.py index e659c2fd9..890dbc431 100644 --- a/fastchat/llm_judge/gen_model_answer.py +++ b/fastchat/llm_judge/gen_model_answer.py @@ -62,6 +62,7 @@ def run_eval( num_gpus_per_model, max_gpu_memory, dtype=dtype, + revision=revision, ) ) @@ -263,7 +264,7 @@ def reorg_answer_file(answer_file): default=None, ) parser.add_argument( - "--model-revision", + "--revision", type=str, default="main", help="The revision of the model on the Hugging Face Hub.", @@ -297,6 +298,7 @@ def reorg_answer_file(answer_file): num_gpus_total=args.num_gpus_total, max_gpu_memory=args.max_gpu_memory, dtype=str_to_torch_dtype(args.dtype), + revision=args.revision, ) reorg_answer_file(answer_file) diff --git a/fastchat/llm_judge/run.sh b/fastchat/llm_judge/run.sh index 5eb47dbd2..78cc88d1a 100755 --- a/fastchat/llm_judge/run.sh +++ b/fastchat/llm_judge/run.sh @@ -3,12 +3,12 @@ set -x -e HUB_MODEL_ID=$1 -[ -z "$2" ] && REVISION="main" || REVISION=$2 -[ -z "$3" ] && DTYPE="float16" || DTYPE=$3 -MT_BENCH_ID=$4 +MT_BENCH_ID=$2 +[ -z "$3" ] && REVISION="main" || REVISION=$3 +[ -z "$4" ] && DTYPE="float16" || DTYPE=$4 # Generate answer -python gen_model_answer.py --model-path $HUB_MODEL_ID --model-revision $REVISION --dtype $DTYPE --model-id $MT_BENCH_ID +python gen_model_answer.py --model-path $HUB_MODEL_ID --revision $REVISION --dtype $DTYPE --model-id $MT_BENCH_ID # Judge! 
python gen_judgment.py --model-list $MT_BENCH_ID diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index 66ffb26a4..66f416dff 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -1482,9 +1482,9 @@ class Hermes2Adapter(BaseModelAdapter): use_fast_tokenizer = False def match(self, model_path: str): - return ( - "openhermes-2.5-mistral-7b" - or "openhermes-2-mistral-7b" in model_path.lower() + return any( + model_str in model_path.lower() + for model_str in ["openhermes-2.5-mistral-7b", "openhermes-2-mistral-7b"] ) def load_model(self, model_path: str, from_pretrained_kwargs: dict): From aaf688dd199f515364c837b592f733485dc9b4f3 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sat, 25 Nov 2023 09:11:17 +0000 Subject: [PATCH 14/19] Tweak --- fastchat/llm_judge/README.md | 14 +++++++++++++- fastchat/model/model_adapter.py | 16 +++++++++------- fastchat/utils.py | 11 +++++++++++ 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/fastchat/llm_judge/README.md b/fastchat/llm_judge/README.md index 9b7b71cff..990b1ce66 100644 --- a/fastchat/llm_judge/README.md +++ b/fastchat/llm_judge/README.md @@ -98,7 +98,19 @@ run.sh {HUB_MODEL_ID} {MT_BENCH_ID} {HUB_MODEL_REVISION} {DTYPE} For example, to evaluate `zephyr-7b-beta` you can run: ```shell -run.sh +./run.sh HuggingFaceH4/zephyr-7b-beta zephyr-7b-beta +``` + +To evaluate a specific revision, run: + +```shell +./run.sh HuggingFaceH4/mistral-7b-dpo mistral-7b-dpo_v8.2 v8.2 +``` + +To evaluate a specific revision and dtype (`float16` is the default and recommended for most models), run: + +```shell +./run.sh HuggingFaceH4/mistral-7b-dpo mistral-7b-dpo_v8.2 v8.2 bfloat16 ``` ### Other grading options diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index 66f416dff..e12425e9c 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -42,7 +42,7 @@ from fastchat.modules.exllama import ExllamaConfig, load_exllama_model from fastchat.modules.xfastertransformer import load_xft_model, XftConfig from fastchat.modules.gptq import GptqConfig, load_gptq_quantized -from fastchat.utils import get_gpu_memory +from fastchat.utils import get_gpu_memory, is_adapter_model # Check an environment variable to check if we should be sharing Peft model # weights. When false we treat all Peft models as separate. @@ -179,7 +179,9 @@ def load_model( """Load a model from Hugging Face.""" # get model adapter adapter = get_model_adapter(model_path) - print(f"Using model adapter: {adapter.__class__.__name__} for {model_path=} and {revision=}") + print( + f"Using model adapter: {adapter.__class__.__name__} for {model_path=} and {revision=}" + ) # Handle device mapping cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration( @@ -1930,11 +1932,15 @@ class H4MistralAdapter(BaseModelAdapter): """The model adapter for H4 Mistral models""" def match(self, model_path: str): - return "HuggingFaceH4" in model_path and "mistral" in model_path.lower() + return "mistral" in model_path.lower() def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("zephyr") + +# Register our adapters first to prioritise over defaults +register_model_adapter(H4MistralAdapter) + # Note: the registration order matters. # The one registered earlier has a higher matching priority. 
register_model_adapter(PeftModelAdapter) @@ -2007,10 +2013,6 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(MicrosoftOrcaAdapter) register_model_adapter(YiAdapter) -############# -# H4 Adapters -############# -register_model_adapter(H4MistralAdapter) # After all adapters, try the default base adapter. register_model_adapter(BaseModelAdapter) diff --git a/fastchat/utils.py b/fastchat/utils.py index b5e3ba543..2008a1c24 100644 --- a/fastchat/utils.py +++ b/fastchat/utils.py @@ -10,6 +10,8 @@ import sys from typing import AsyncGenerator, Generator import warnings +from huggingface_hub import list_repo_files +from huggingface_hub.utils._validators import HFValidationError import requests @@ -347,3 +349,12 @@ def str_to_torch_dtype(dtype: str): return torch.bfloat16 else: raise ValueError(f"Unrecognized dtype: {dtype}") + + +def is_adapter_model(model_name_or_path: str, revision: str = "main") -> bool: + try: + repo_files = list_repo_files(model_name_or_path, revision=revision) + except HFValidationError: + # check local files + repo_files = os.listdir(model_name_or_path) + return "adapter_model.bin" in repo_files From e562e1d2aac67f4aedb485185cdbfdca609bd253 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sat, 25 Nov 2023 09:58:49 +0000 Subject: [PATCH 15/19] Add PEFT revision --- fastchat/llm_judge/gen_model_answer.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fastchat/llm_judge/gen_model_answer.py b/fastchat/llm_judge/gen_model_answer.py index 890dbc431..82df129d8 100644 --- a/fastchat/llm_judge/gen_model_answer.py +++ b/fastchat/llm_judge/gen_model_answer.py @@ -32,6 +32,7 @@ def run_eval( max_gpu_memory, dtype, revision, + base_model_revision, ): questions = load_questions(question_file, question_begin, question_end) # random shuffle the questions to balance the loading @@ -63,6 +64,7 @@ def run_eval( max_gpu_memory, dtype=dtype, revision=revision, + base_model_revision=base_model_revision, ) ) @@ -82,10 +84,12 @@ def get_model_answers( max_gpu_memory, dtype, revision, + base_model_revision, ): model, tokenizer = load_model( model_path, revision=revision, + base_model_revision=base_model_revision, device="cuda", num_gpus=num_gpus_per_model, max_gpu_memory=max_gpu_memory, @@ -104,7 +108,7 @@ def get_model_answers( choices = [] for i in range(num_choices): torch.manual_seed(i) - conv = get_conversation_template(model_id) + conv = get_conversation_template(model_path) turns = [] for j in range(len(question["turns"])): qs = question["turns"][j] @@ -121,7 +125,7 @@ def get_model_answers( # some models may error out when generating long outputs try: output_ids = model.generate( - torch.as_tensor(input_ids).cuda(), + inputs=torch.as_tensor(input_ids).cuda(), do_sample=do_sample, temperature=temperature, max_new_tokens=max_new_token, @@ -269,6 +273,12 @@ def reorg_answer_file(answer_file): default="main", help="The revision of the model on the Hugging Face Hub.", ) + parser.add_argument( + "--base-model-revision", + type=str, + default="main", + help="The revision of the base model for PEFT adapters.", + ) args = parser.parse_args() @@ -299,6 +309,7 @@ def reorg_answer_file(answer_file): max_gpu_memory=args.max_gpu_memory, dtype=str_to_torch_dtype(args.dtype), revision=args.revision, + base_model_revision=args.base_model_revision, ) reorg_answer_file(answer_file) From fc4a40f3216cbecca4272348293c7ae7643e26ab Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sat, 25 Nov 2023 10:03:26 +0000 Subject: [PATCH 
16/19] Style --- fastchat/model/model_adapter.py | 24 ++++++++++++++++++------ fastchat/utils.py | 7 +++++-- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index e12425e9c..5e9536562 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -118,8 +118,11 @@ def register_model_adapter(cls): @cache -def get_model_adapter(model_path: str) -> BaseModelAdapter: +def get_model_adapter(model_path: str, revision: str = "main") -> BaseModelAdapter: """Get a model adapter for a model_path.""" + if is_adapter_model(model_path, revision=revision) is True: + return PeftModelAdapter() + model_path_basename = os.path.basename(os.path.normpath(model_path)) # Try the basename of model_path at first @@ -174,11 +177,12 @@ def load_model( exllama_config: Optional[ExllamaConfig] = None, xft_config: Optional[XftConfig] = None, revision: str = "main", + base_model_revision: str = "main", debug: bool = False, ): """Load a model from Hugging Face.""" # get model adapter - adapter = get_model_adapter(model_path) + adapter = get_model_adapter(model_path, revision=revision) print( f"Using model adapter: {adapter.__class__.__name__} for {model_path=} and {revision=}" ) @@ -308,6 +312,9 @@ def load_model( return model, tokenizer kwargs["revision"] = revision + if is_adapter_model(model_path, revision=revision) is True: + kwargs["base_model_revision"] = base_model_revision + if dtype is not None: # Overwrite dtype if it is provided in the arguments. kwargs["torch_dtype"] = dtype @@ -544,7 +551,7 @@ class PeftModelAdapter: def match(self, model_path: str): """Accepts any model path with "peft" in the name""" - if os.path.exists(os.path.join(model_path, "adapter_config.json")): + if is_adapter_model(model_path): return True return "peft" in model_path.lower() @@ -552,7 +559,8 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): """Loads the base model then the (peft) adapter weights""" from peft import PeftConfig, PeftModel - config = PeftConfig.from_pretrained(model_path) + revision = from_pretrained_kwargs.get("revision", "main") + config = PeftConfig.from_pretrained(model_path, revision=revision) base_model_path = config.base_model_name_or_path if "peft" in base_model_path: raise ValueError( @@ -584,17 +592,21 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): # Super important: make sure we use model_path as the # `adapter_name`. model = PeftModel.from_pretrained( - base_model, model_path, adapter_name=model_path + base_model, model_path, adapter_name=model_path, revision=revision ) peft_model_cache[base_model_path] = (model, tokenizer) return model, tokenizer # In the normal case, load up the base model weights again. 
base_adapter = get_model_adapter(base_model_path) + from_pretrained_kwargs["revision"] = from_pretrained_kwargs.get( + "base_model_revision", "main" + ) + from_pretrained_kwargs.pop("base_model_revision", None) base_model, tokenizer = base_adapter.load_model( base_model_path, from_pretrained_kwargs ) - model = PeftModel.from_pretrained(base_model, model_path) + model = PeftModel.from_pretrained(base_model, model_path, revision=revision) return model, tokenizer def get_default_conv_template(self, model_path: str) -> Conversation: diff --git a/fastchat/utils.py b/fastchat/utils.py index 2008a1c24..90a66cf4d 100644 --- a/fastchat/utils.py +++ b/fastchat/utils.py @@ -353,8 +353,11 @@ def str_to_torch_dtype(dtype: str): def is_adapter_model(model_name_or_path: str, revision: str = "main") -> bool: try: + # Try first if model on a Hub repo repo_files = list_repo_files(model_name_or_path, revision=revision) except HFValidationError: - # check local files + # If not, check local repo repo_files = os.listdir(model_name_or_path) - return "adapter_model.bin" in repo_files + return ( + "adapter_model.safetensors" in repo_files or "adapter_model.bin" in repo_files + ) From 85f828a613ac90118afbf4ebd6d1aa4fcca448db Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sat, 25 Nov 2023 10:22:39 +0000 Subject: [PATCH 17/19] Log --- fastchat/llm_judge/gen_model_answer.py | 1 + fastchat/model/model_adapter.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/fastchat/llm_judge/gen_model_answer.py b/fastchat/llm_judge/gen_model_answer.py index 82df129d8..8395640e0 100644 --- a/fastchat/llm_judge/gen_model_answer.py +++ b/fastchat/llm_judge/gen_model_answer.py @@ -109,6 +109,7 @@ def get_model_answers( for i in range(num_choices): torch.manual_seed(i) conv = get_conversation_template(model_path) + print(f"Using chat template {conv.name} to generate answers") turns = [] for j in range(len(question["turns"])): qs = question["turns"][j] diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index 5e9536562..f939e32eb 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -599,6 +599,8 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): # In the normal case, load up the base model weights again. base_adapter = get_model_adapter(base_model_path) + + # h4: we override the `revision` arg to point to the revision of the base model instead of the adapter one. 
from_pretrained_kwargs["revision"] = from_pretrained_kwargs.get( "base_model_revision", "main" ) From e587f31b452f6dbcee47a2fc6846f28d8fb34404 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sat, 25 Nov 2023 10:23:08 +0000 Subject: [PATCH 18/19] Fix --- fastchat/llm_judge/gen_model_answer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastchat/llm_judge/gen_model_answer.py b/fastchat/llm_judge/gen_model_answer.py index 8395640e0..89272c5b3 100644 --- a/fastchat/llm_judge/gen_model_answer.py +++ b/fastchat/llm_judge/gen_model_answer.py @@ -109,7 +109,7 @@ def get_model_answers( for i in range(num_choices): torch.manual_seed(i) conv = get_conversation_template(model_path) - print(f"Using chat template {conv.name} to generate answers") + print(f"Using chat template `{conv.name}` to generate answers") turns = [] for j in range(len(question["turns"])): qs = question["turns"][j] From f3a30b5f43aa2dbe4406386cb63cae0c7824fc11 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sat, 25 Nov 2023 11:08:49 +0000 Subject: [PATCH 19/19] Exclude GPT-4 --- fastchat/llm_judge/gen_model_answer.py | 6 +++++- fastchat/model/model_adapter.py | 14 +++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fastchat/llm_judge/gen_model_answer.py b/fastchat/llm_judge/gen_model_answer.py index 89272c5b3..a4f4bd351 100644 --- a/fastchat/llm_judge/gen_model_answer.py +++ b/fastchat/llm_judge/gen_model_answer.py @@ -234,6 +234,7 @@ def reorg_answer_file(answer_file): parser.add_argument( "--question-end", type=int, help="A debug option. The end index of questions." ) + parser.add_argument("--question-file", type=str, help="The input question file.") parser.add_argument("--answer-file", type=str, help="The output answer file.") parser.add_argument( "--max-new-token", @@ -288,7 +289,10 @@ def reorg_answer_file(answer_file): ray.init() - question_file = f"data/{args.bench_name}/question.jsonl" + if args.question_file: + question_file = args.question_file + else: + question_file = f"data/{args.bench_name}/question.jsonl" if args.answer_file: answer_file = args.answer_file else: diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index f939e32eb..05a7ca2d8 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -56,6 +56,13 @@ "claude-instant-1", ) +OPENAI_MODEL_LIST = ( + "gpt-4", + "gpt-3.5-turbo", +) + +JUDGE_MODEL_LIST = ANTHROPIC_MODEL_LIST + OPENAI_MODEL_LIST + class BaseModelAdapter: """The base and the default model adapter.""" @@ -120,7 +127,12 @@ def register_model_adapter(cls): @cache def get_model_adapter(model_path: str, revision: str = "main") -> BaseModelAdapter: """Get a model adapter for a model_path.""" - if is_adapter_model(model_path, revision=revision) is True: + + # Exclude judge LLMs from the model adapter list + if ( + model_path not in JUDGE_MODEL_LIST + and is_adapter_model(model_path, revision=revision) is True + ): return PeftModelAdapter() model_path_basename = os.path.basename(os.path.normpath(model_path))