-
Notifications
You must be signed in to change notification settings - Fork 3
H4 v2 #4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
H4 v2 #4
Changes from all commits
336f9a8
98c1c2f
be606d6
176d26d
d0ae0dd
1fd1a98
e15bae6
ed40de4
4e964f0
d7e5995
04f556f
7347d4d
f3beee4
e696df0
332a49c
de55e69
aaf688d
e562e1d
fc4a40f
85f828a
e587f31
f3a30b5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,3 +29,6 @@ tests/state_of_the_union.txt | |
|
|
||
| # Build | ||
| build | ||
|
|
||
| # Data | ||
| fastchat/llm_judge/data/ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -301,7 +301,6 @@ def make_judge_single(judge_model, judge_prompts): | |
| # Show match stats and prompt enter to continue | ||
| print("Stats:") | ||
| print(json.dumps(match_stat, indent=4)) | ||
| input("Press Enter to confirm...") | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removed so we can run evals without confirmation |
||
|
|
||
| # Play matches | ||
| if args.parallel == 1: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,8 @@ def run_eval( | |
| num_gpus_total, | ||
| max_gpu_memory, | ||
| dtype, | ||
| revision, | ||
| base_model_revision, | ||
| ): | ||
| questions = load_questions(question_file, question_begin, question_end) | ||
| # random shuffle the questions to balance the loading | ||
|
|
@@ -61,6 +63,8 @@ def run_eval( | |
| num_gpus_per_model, | ||
| max_gpu_memory, | ||
| dtype=dtype, | ||
| revision=revision, | ||
| base_model_revision=base_model_revision, | ||
| ) | ||
| ) | ||
|
|
||
|
|
@@ -79,9 +83,13 @@ def get_model_answers( | |
| num_gpus_per_model, | ||
| max_gpu_memory, | ||
| dtype, | ||
| revision, | ||
| base_model_revision, | ||
| ): | ||
| model, tokenizer = load_model( | ||
| model_path, | ||
| revision=revision, | ||
| base_model_revision=base_model_revision, | ||
| device="cuda", | ||
| num_gpus=num_gpus_per_model, | ||
| max_gpu_memory=max_gpu_memory, | ||
|
|
@@ -100,7 +108,8 @@ def get_model_answers( | |
| choices = [] | ||
| for i in range(num_choices): | ||
| torch.manual_seed(i) | ||
| conv = get_conversation_template(model_id) | ||
| conv = get_conversation_template(model_path) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is needed because the |
||
| print(f"Using chat template `{conv.name}` to generate answers") | ||
| turns = [] | ||
| for j in range(len(question["turns"])): | ||
| qs = question["turns"][j] | ||
|
|
@@ -117,7 +126,7 @@ def get_model_answers( | |
| # some models may error out when generating long outputs | ||
| try: | ||
| output_ids = model.generate( | ||
| torch.as_tensor(input_ids).cuda(), | ||
| inputs=torch.as_tensor(input_ids).cuda(), | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Needed for |
||
| do_sample=do_sample, | ||
| temperature=temperature, | ||
| max_new_tokens=max_new_token, | ||
|
|
@@ -225,6 +234,7 @@ def reorg_answer_file(answer_file): | |
| parser.add_argument( | ||
| "--question-end", type=int, help="A debug option. The end index of questions." | ||
| ) | ||
| parser.add_argument("--question-file", type=str, help="The input question file.") | ||
| parser.add_argument("--answer-file", type=str, help="The output answer file.") | ||
| parser.add_argument( | ||
| "--max-new-token", | ||
|
|
@@ -259,6 +269,18 @@ def reorg_answer_file(answer_file): | |
| help="Override the default dtype. If not set, it will use float16 on GPU and float32 on CPU.", | ||
| default=None, | ||
| ) | ||
| parser.add_argument( | ||
| "--revision", | ||
| type=str, | ||
| default="main", | ||
| help="The revision of the model on the Hugging Face Hub.", | ||
| ) | ||
| parser.add_argument( | ||
| "--base-model-revision", | ||
| type=str, | ||
| default="main", | ||
| help="The revision of the base model for PEFT adapters.", | ||
| ) | ||
|
|
||
| args = parser.parse_args() | ||
|
|
||
|
|
@@ -267,7 +289,10 @@ def reorg_answer_file(answer_file): | |
|
|
||
| ray.init() | ||
|
|
||
| question_file = f"data/{args.bench_name}/question.jsonl" | ||
| if args.question_file: | ||
| question_file = args.question_file | ||
| else: | ||
| question_file = f"data/{args.bench_name}/question.jsonl" | ||
| if args.answer_file: | ||
| answer_file = args.answer_file | ||
| else: | ||
|
|
@@ -288,6 +313,8 @@ def reorg_answer_file(answer_file): | |
| num_gpus_total=args.num_gpus_total, | ||
| max_gpu_memory=args.max_gpu_memory, | ||
| dtype=str_to_torch_dtype(args.dtype), | ||
| revision=args.revision, | ||
| base_model_revision=args.base_model_revision, | ||
| ) | ||
|
|
||
| reorg_answer_file(answer_file) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,17 @@ | ||
| #!/bin/bash | ||
|
|
||
| set -x -e | ||
|
|
||
| HUB_MODEL_ID=$1 | ||
| MT_BENCH_ID=$2 | ||
| [ -z "$3" ] && REVISION="main" || REVISION=$3 | ||
| [ -z "$4" ] && DTYPE="float16" || DTYPE=$4 | ||
|
|
||
| # Generate answer | ||
| python gen_model_answer.py --model-path $HUB_MODEL_ID --revision $REVISION --dtype $DTYPE --model-id $MT_BENCH_ID | ||
|
|
||
| # Judge! | ||
| python gen_judgment.py --model-list $MT_BENCH_ID | ||
|
|
||
| # Get results | ||
| python show_result.py |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note for internal use - not to be upstreamed