diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..726fb45 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +claude.md diff --git a/common.py b/common.py new file mode 100644 index 0000000..4459ba7 --- /dev/null +++ b/common.py @@ -0,0 +1,186 @@ +""" +Common utility functions for ToMBench evaluation scripts. +This module contains shared functions used by both API-based and HuggingFace evaluation. +""" +import random +from prompts import * + + +def format_prompt_4(d, args): + """ + Format prompt for 4-choice questions. + + Args: + d: Data dictionary containing the question and options + args: Arguments containing language setting + + Returns: + tuple: (mapping_dict, formatted_prompt) + """ + if args.language == 'zh': + cA = d['选项A'].replace("A. ", "") + cB = d['选项B'].replace("B. ", "") + cC = d['选项C'].replace("C. ", "") + cD = d['选项D'].replace("D. ", "") + choices = [cA, cB, cC, cD] + random.shuffle(choices) + prompt = UserEvaluatePrompt4Choices_zh.format(story=d['故事'], question=d['问题'], choice_a=choices[0], choice_b=choices[1], choice_c=choices[2], choice_d=choices[3]) + map = {"A": "", "B": "", "C": "", "D": ""} + + if choices[0] == cA: + map['A'] = 'A' + elif choices[0] == cB: + map['A'] = 'B' + elif choices[0] == cC: + map['A'] = 'C' + elif choices[0] == cD: + map['A'] = 'D' + + if choices[1] == cA: + map['B'] = 'A' + elif choices[1] == cB: + map['B'] = 'B' + elif choices[1] == cC: + map['B'] = 'C' + elif choices[1] == cD: + map['B'] = 'D' + + if choices[2] == cA: + map['C'] = 'A' + elif choices[2] == cB: + map['C'] = 'B' + elif choices[2] == cC: + map['C'] = 'C' + elif choices[2] == cD: + map['C'] = 'D' + + if choices[3] == cA: + map['D'] = 'A' + elif choices[3] == cB: + map['D'] = 'B' + elif choices[3] == cC: + map['D'] = 'C' + elif choices[3] == cD: + map['D'] = 'D' + else: + cA = d['OPTION-A'].replace("A. ", "") + cB = d['OPTION-B'].replace("B. ", "") + cC = d['OPTION-C'].replace("C. ", "") + cD = d['OPTION-D'].replace("D. ", "") + choices = [cA, cB, cC, cD] + random.shuffle(choices) + prompt = UserEvaluatePrompt4Choices_en.format(story=d['STORY'], question=d['QUESTION'], choice_a=choices[0], choice_b=choices[1], choice_c=choices[2], choice_d=choices[3]) + map = {"A": "", "B": "", "C": "", "D": ""} + + if choices[0] == cA: + map['A'] = 'A' + elif choices[0] == cB: + map['A'] = 'B' + elif choices[0] == cC: + map['A'] = 'C' + elif choices[0] == cD: + map['A'] = 'D' + + if choices[1] == cA: + map['B'] = 'A' + elif choices[1] == cB: + map['B'] = 'B' + elif choices[1] == cC: + map['B'] = 'C' + elif choices[1] == cD: + map['B'] = 'D' + + if choices[2] == cA: + map['C'] = 'A' + elif choices[2] == cB: + map['C'] = 'B' + elif choices[2] == cC: + map['C'] = 'C' + elif choices[2] == cD: + map['C'] = 'D' + + if choices[3] == cA: + map['D'] = 'A' + elif choices[3] == cB: + map['D'] = 'B' + elif choices[3] == cC: + map['D'] = 'C' + elif choices[3] == cD: + map['D'] = 'D' + return map, prompt + + +def format_prompt_2(d, args): + """ + Format prompt for 2-choice questions. + + Args: + d: Data dictionary containing the question and options + args: Arguments containing language setting + + Returns: + tuple: (mapping_dict, formatted_prompt) + """ + if args.language == 'zh': + cA = d['选项A'].replace("A. ", "") + cB = d['选项B'].replace("B. ", "") + choices = [cA, cB] + random.shuffle(choices) + prompt = UserEvaluatePrompt2Choices_zh.format(story=d['故事'], question=d['问题'], choice_a=choices[0], choice_b=choices[1]) + map = {"A": "", "B": "", "C": "", "D": ""} + if choices[0] == cA: + map['A'] = 'A' + elif choices[0] == cB: + map['A'] = 'B' + + if choices[1] == cA: + map['B'] = 'A' + elif choices[1] == cB: + map['B'] = 'B' + else: + cA = d['OPTION-A'].replace("A. ", "") + cB = d['OPTION-B'].replace("B. ", "") + choices = [cA, cB] + random.shuffle(choices) + prompt = UserEvaluatePrompt2Choices_en.format(story=d['STORY'], question=d['QUESTION'], choice_a=choices[0], choice_b=choices[1]) + map = {"A": "", "B": "", "C": "", "D": ""} + if choices[0] == cA: + map['A'] = 'A' + elif choices[0] == cB: + map['A'] = 'B' + + if choices[1] == cA: + map['B'] = 'A' + elif choices[1] == cB: + map['B'] = 'B' + + return map, prompt + + +def get_system_prompt(args): + """ + Get the appropriate system prompt based on language and CoT settings. + + Args: + args: Arguments containing language and cot settings + + Returns: + str: The formatted system prompt + """ + if args.language == "zh": + return SystemEvaluatePrompt_zh_cot if args.cot else SystemEvaluatePrompt_zh + else: + return SystemEvaluatePrompt_en_cot if args.cot else SystemEvaluatePrompt_en + + +def has_four_choices(d): + """ + Check if the data entry has four answer options. + + Args: + d: Data dictionary containing the question and options + + Returns: + bool: True if 4-choice options exist, False for 2-choice + """ + return ('选项C' in d and d.get('选项C')) or ('OPTION-C' in d and d.get('OPTION-C')) diff --git a/eval_huggingface.sh b/eval_huggingface.sh index fe41f52..374aa83 100644 --- a/eval_huggingface.sh +++ b/eval_huggingface.sh @@ -1,9 +1,7 @@ python3 run_huggingface.py \ --task "" \ --model_name "" \ - --api_base "" \ - --api_key "" \ --language "zh" \ --cot True \ --try_times 5 \ - --output_dir ./results \ \ No newline at end of file + --output_path ./results \ \ No newline at end of file diff --git a/get_results.py b/get_results.py index 53a1a1e..a28b4bf 100644 --- a/get_results.py +++ b/get_results.py @@ -3,6 +3,17 @@ import os def most_common_element(lst): + """ + Find the most common element in a list. + + Args: + lst: List of elements + + Returns: + The most common element, or None if list is empty + """ + if not lst: + return None element_freq = {} for item in lst: element_freq[item] = element_freq.get(item, 0) + 1 @@ -11,6 +22,17 @@ def most_common_element(lst): def extract_answer(text): + """ + Extract answer from model output. + + Args: + text: Model output text + + Returns: + Extracted answer (A, B, C, or D) + """ + if not text: + return "A" if "[[A]]" in text: return "A" elif "[[B]]" in text: @@ -45,39 +67,54 @@ def extract_answer(text): parser.add_argument("--input_path", type=str, default="") parser.add_argument("--try_times", type=int, default=5) args = parser.parse_args() - + files = os.listdir("./results") acc_per_task = {} cnt_per_task = {} acc_per_ability = {} cnt_per_ability = {} - + for file in files: with open(f"./results/{file}", "r", encoding='utf-8') as f: data = [json.loads(line) for line in f.readlines()] - - answers = ["" for _ in range(len(data) // args.try_times)] - preds = [[] for _ in range(len(data) // args.try_times)] - abilities = ["" for _ in range(len(data) // args.try_times)] + + # Find max index to properly size arrays + max_idx = max(d.get('idx', 0) for d in data) + 1 if data else 0 + + answers = [""] * max_idx + preds = [[] for _ in range(max_idx)] + abilities = [""] * max_idx + for d in data: - preds[d['idx']].append(d['map'][extract_answer(d['output'])]) - if answers[d['idx']] == "": - answers[d['idx']] = d['answer'] - - if abilities[d['idx']] == "": - abilities[d['idx']] = d['data']['能力\nABILITY'] - - - for i in range(len(data) // args.try_times): + idx = d.get('idx', 0) + extracted = extract_answer(d.get('output', '')) + # Safely get mapped answer, default to extracted if not in map + mapped = d.get('map', {}).get(extracted, extracted) + if mapped: # Only append non-empty values + preds[idx].append(mapped) + if answers[idx] == "": + answers[idx] = d.get('answer', '') + + if abilities[idx] == "": + abilities[idx] = d.get('data', {}).get('能力\nABILITY', '') + + # Count valid samples + valid_samples = sum(1 for i in range(max_idx) if answers[i]) + + for i in range(max_idx): + if not answers[i]: + continue # Skip empty samples + task = file.split("_")[0] - ability = abilities[i] + ability = abilities[i] if abilities[i] else "Unknown" cnt_per_task[task] = cnt_per_task.get(task, 0) + 1 cnt_per_ability[ability] = cnt_per_ability.get(ability, 0) + 1 - - if answers[i] == most_common_element(preds[i]): + # Get most common prediction + most_common = most_common_element(preds[i]) + if most_common and answers[i] == most_common: acc_per_task[task] = acc_per_task.get(task, 0) + 1 acc_per_ability[ability] = acc_per_ability.get(ability, 0) + 1 diff --git a/run_api.py b/run_api.py index 6318a10..9e9b47f 100644 --- a/run_api.py +++ b/run_api.py @@ -1,20 +1,28 @@ import json import random import time +import logging from tqdm import tqdm import multiprocessing from concurrent.futures import ThreadPoolExecutor -import openai +from openai import OpenAI import os import argparse from prompts import * +from common import format_prompt_4, format_prompt_2, get_system_prompt, has_four_choices + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) class ChatGPTProcessor: - def __init__(self): + def __init__(self, api_base="", api_key=""): self.lock = multiprocessing.Lock() - openai.api_key = "" - openai.api_base = "" + # Use provided values or fall back to environment variables + api_key = api_key or os.environ.get("OPENAI_API_KEY", "") + api_base = api_base or os.environ.get("OPENAI_API_BASE", "") + self.client = OpenAI(api_key=api_key, base_url=api_base) def read_jsonl(self, input_file): with open(input_file, 'r', encoding='utf-8') as f: @@ -22,159 +30,70 @@ def read_jsonl(self, input_file): return list(map(json.loads, tqdm(lines, desc='Reading...'))) def write_to_json(self, data, file_path): + """ + Thread-safe JSON file writing. + + Args: + data: Dictionary to write to JSON file + file_path: Path to the output file + """ with self.lock: + # Ensure directory exists + os.makedirs(os.path.dirname(file_path), exist_ok=True) + # Write to file with proper encoding with open(file_path, 'a', encoding='utf-8') as file: json.dump(data, file, ensure_ascii=False) file.write('\n') - def multiple_gpt(self, payload): - while True: + def multiple_gpt(self, payload, max_retries=5): + """ + Send request to API with retry logic and error logging. + + Args: + payload: Dictionary containing request parameters + max_retries: Maximum number of retry attempts (default: 5) + """ + retries = 0 + while retries < max_retries: try: - chat_completion = openai.ChatCompletion.create(model=payload['model'], temperature=0, messages=payload['messages']) + chat_completion = self.client.chat.completions.create( + model=payload['model'], + temperature=0, + messages=payload['messages'] + ) data = payload.copy() data['messages'] = payload['messages'] data['answer'] = payload['answer'] data['output'] = chat_completion.choices[0].message.content break + except openai.RateLimitError as e: + retries += 1 + wait_time = random.randint(2, 5) * retries + logger.warning(f"Rate limit error (attempt {retries}/{max_retries}): {e}. Waiting {wait_time}s...") + time.sleep(wait_time) + except openai.APIError as e: + retries += 1 + wait_time = random.randint(1, 3) + logger.warning(f"API error (attempt {retries}/{max_retries}): {e}. Waiting {wait_time}s...") + time.sleep(wait_time) + except openai.AuthenticationError as e: + logger.error(f"Authentication error: {e}. Please check your API key.") + raise except Exception as e: + retries += 1 + logger.error(f"Unexpected error (attempt {retries}/{max_retries}): {e}") time.sleep(random.randint(1, 3)) + else: + logger.error(f"Max retries ({max_retries}) exceeded for payload idx={payload['idx']}, number={payload['number']}") + # Write error result to maintain data continuity + data = payload.copy() + data['output'] = f"ERROR: Max retries exceeded" self.write_to_json(data, payload['save_path']) time.sleep(random.randint(1, 3)) -def format_prompt_4(d, args): - if args.language == 'zh': - cA = d['选项A'].replace("A. ", "") - cB = d['选项B'].replace("B. ", "") - cC = d['选项C'].replace("C. ", "") - cD = d['选项D'].replace("D. ", "") - choices = [cA, cB, cC, cD] - random.shuffle(choices) - prompt = UserEvaluatePrompt4Choices_zh.format(story=d['故事'], question=d['问题'], choice_a=choices[0], choice_b=choices[1], choice_c=choices[2], choice_d=choices[3]) - map = {"A": "", "B": "", "C": "", "D": ""} - - if choices[0] == cA: - map['A'] = 'A' - elif choices[0] == cB: - map['A'] = 'B' - elif choices[0] == cC: - map['A'] = 'C' - elif choices[0] == cD: - map['A'] = 'D' - - if choices[1] == cA: - map['B'] = 'A' - elif choices[1] == cB: - map['B'] = 'B' - elif choices[1] == cC: - map['B'] = 'C' - elif choices[1] == cD: - map['B'] = 'D' - - if choices[2] == cA: - map['C'] = 'A' - elif choices[2] == cB: - map['C'] = 'B' - elif choices[2] == cC: - map['C'] = 'C' - elif choices[2] == cD: - map['C'] = 'D' - - if choices[3] == cA: - map['D'] = 'A' - elif choices[3] == cB: - map['D'] = 'B' - elif choices[3] == cC: - map['D'] = 'C' - elif choices[3] == cD: - map['D'] = 'D' - else: - cA = d['OPTION-A'].replace("A. ", "") - cB = d['OPTION-B'].replace("B. ", "") - cC = d['OPTION-C'].replace("C. ", "") - cD = d['OPTION-D'].replace("D. ", "") - choices = [cA, cB, cC, cD] - random.shuffle(choices) - prompt = UserEvaluatePrompt4Choices_en.format(story=d['STORY'], question=d['QUESTION'], choice_a=choices[0], choice_b=choices[1], choice_c=choices[2], choice_d=choices[3]) - map = {"A": "", "B": "", "C": "", "D": ""} - - if choices[0] == cA: - map['A'] = 'A' - elif choices[0] == cB: - map['A'] = 'B' - elif choices[0] == cC: - map['A'] = 'C' - elif choices[0] == cD: - map['A'] = 'D' - - if choices[1] == cA: - map['B'] = 'A' - elif choices[1] == cB: - map['B'] = 'B' - elif choices[1] == cC: - map['B'] = 'C' - elif choices[1] == cD: - map['B'] = 'D' - - if choices[2] == cA: - map['C'] = 'A' - elif choices[2] == cB: - map['C'] = 'B' - elif choices[2] == cC: - map['C'] = 'C' - elif choices[2] == cD: - map['C'] = 'D' - - if choices[3] == cA: - map['D'] = 'A' - elif choices[3] == cB: - map['D'] = 'B' - elif choices[3] == cC: - map['D'] = 'C' - elif choices[3] == cD: - map['D'] = 'D' - return map, prompt - - -def format_prompt_2(d, args): - if args.language == 'zh': - cA = d['选项A'].replace("A. ", "") - cB = d['选项B'].replace("B. ", "") - choices = [cA, cB] - random.shuffle(choices) - prompt = UserEvaluatePrompt2Choices_zh.format(story=d['故事'], question=d['问题'], choice_a=choices[0], choice_b=choices[1]) - map = {"A": "", "B": "", "C": "", "D": ""} - if choices[0] == cA: - map['A'] = 'A' - elif choices[0] == cB: - map['A'] = 'B' - - if choices[1] == cA: - map['B'] = 'A' - elif choices[1] == cB: - map['B'] = 'B' - else: - cA = d['OPTION-A'].replace("A. ", "") - cB = d['OPTION-B'].replace("B. ", "") - choices = [cA, cB] - random.shuffle(choices) - prompt = UserEvaluatePrompt2Choices_en.format(story=d['STORY'], question=d['QUESTION'], choice_a=choices[0], choice_b=choices[1]) - map = {"A": "", "B": "", "C": "", "D": ""} - if choices[0] == cA: - map['A'] = 'A' - elif choices[0] == cB: - map['A'] = 'B' - - if choices[1] == cA: - map['B'] = 'A' - elif choices[1] == cB: - map['B'] = 'B' - - return map, prompt - - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--task", type=str, default="") @@ -189,7 +108,7 @@ def format_prompt_2(d, args): args = parser.parse_args() random.seed(args.seed) - processor = ChatGPTProcessor() + processor = ChatGPTProcessor(api_base=args.api_base, api_key=args.api_key) files = os.listdir("./data") if args.task != "": @@ -200,28 +119,20 @@ def format_prompt_2(d, args): try: with open(f"data/{file}", "r", encoding='utf-8') as f: data = [json.loads(line) for line in f.readlines()] - except: + except (FileNotFoundError, json.JSONDecodeError, IOError) as e: + print(f"Error reading {file}: {e}") continue payloads = [] for i, d in enumerate(data): for j in range(args.try_times): - if d['选项C'] != None: + # Check if 4-choice options exist (supports both Chinese and English keys) + if has_four_choices(d): maps, prompt = format_prompt_4(d, args) else: maps, prompt = format_prompt_2(d, args) - - system_prompt = "" - if args.language == "zh": - if args.cot == False: - system_prompt = SystemEvaluatePrompt_zh - else: - system_prompt = SystemEvaluatePrompt_zh_cot - else: - if args.cot == False: - system_prompt = SystemEvaluatePrompt_en - else: - system_prompt = SystemEvaluatePrompt_en_cot - + + system_prompt = get_system_prompt(args) + payload = { "model": args.model_name, "stream": False, @@ -240,7 +151,10 @@ def format_prompt_2(d, args): } payloads.append(payload) - + + # Execute API calls in parallel and wait for completion with ThreadPoolExecutor(max_workers=32) as executor: - for payload in payloads: - executor.submit(processor.multiple_gpt, payload) + futures = [executor.submit(processor.multiple_gpt, payload) for payload in payloads] + # Wait for all tasks to complete + for future in futures: + future.result() diff --git a/run_huggingface.py b/run_huggingface.py index ac4e9bf..c1a99e0 100644 --- a/run_huggingface.py +++ b/run_huggingface.py @@ -5,137 +5,7 @@ from prompts import * from tqdm import tqdm import os - - -def format_prompt_4(d, args): - if args.language == 'zh': - cA = d['选项A'].replace("A. ", "") - cB = d['选项B'].replace("B. ", "") - cC = d['选项C'].replace("C. ", "") - cD = d['选项D'].replace("D. ", "") - choices = [cA, cB, cC, cD] - random.shuffle(choices) - prompt = UserEvaluatePrompt4Choices_zh.format(story=d['故事'], question=d['问题'], choice_a=choices[0], choice_b=choices[1], choice_c=choices[2], choice_d=choices[3]) - map = {"A": "", "B": "", "C": "", "D": ""} - - if choices[0] == cA: - map['A'] = 'A' - elif choices[0] == cB: - map['A'] = 'B' - elif choices[0] == cC: - map['A'] = 'C' - elif choices[0] == cD: - map['A'] = 'D' - - if choices[1] == cA: - map['B'] = 'A' - elif choices[1] == cB: - map['B'] = 'B' - elif choices[1] == cC: - map['B'] = 'C' - elif choices[1] == cD: - map['B'] = 'D' - - if choices[2] == cA: - map['C'] = 'A' - elif choices[2] == cB: - map['C'] = 'B' - elif choices[2] == cC: - map['C'] = 'C' - elif choices[2] == cD: - map['C'] = 'D' - - if choices[3] == cA: - map['D'] = 'A' - elif choices[3] == cB: - map['D'] = 'B' - elif choices[3] == cC: - map['D'] = 'C' - elif choices[3] == cD: - map['D'] = 'D' - else: - cA = d['OPTION-A'].replace("A. ", "") - cB = d['OPTION-B'].replace("B. ", "") - cC = d['OPTION-C'].replace("C. ", "") - cD = d['OPTION-D'].replace("D. ", "") - choices = [cA, cB, cC, cD] - random.shuffle(choices) - prompt = UserEvaluatePrompt4Choices_en.format(story=d['STORY'], question=d['QUESTION'], choice_a=choices[0], choice_b=choices[1], choice_c=choices[2], choice_d=choices[3]) - map = {"A": "", "B": "", "C": "", "D": ""} - - if choices[0] == cA: - map['A'] = 'A' - elif choices[0] == cB: - map['A'] = 'B' - elif choices[0] == cC: - map['A'] = 'C' - elif choices[0] == cD: - map['A'] = 'D' - - if choices[1] == cA: - map['B'] = 'A' - elif choices[1] == cB: - map['B'] = 'B' - elif choices[1] == cC: - map['B'] = 'C' - elif choices[1] == cD: - map['B'] = 'D' - - if choices[2] == cA: - map['C'] = 'A' - elif choices[2] == cB: - map['C'] = 'B' - elif choices[2] == cC: - map['C'] = 'C' - elif choices[2] == cD: - map['C'] = 'D' - - if choices[3] == cA: - map['D'] = 'A' - elif choices[3] == cB: - map['D'] = 'B' - elif choices[3] == cC: - map['D'] = 'C' - elif choices[3] == cD: - map['D'] = 'D' - return map, prompt - - -def format_prompt_2(d, args): - if args.language == 'zh': - cA = d['选项A'].replace("A. ", "") - cB = d['选项B'].replace("B. ", "") - choices = [cA, cB] - random.shuffle(choices) - prompt = UserEvaluatePrompt2Choices_zh.format(story=d['故事'], question=d['问题'], choice_a=choices[0], choice_b=choices[1]) - map = {"A": "", "B": "", "C": "", "D": ""} - if choices[0] == cA: - map['A'] = 'A' - elif choices[0] == cB: - map['A'] = 'B' - - if choices[1] == cA: - map['B'] = 'A' - elif choices[1] == cB: - map['B'] = 'B' - else: - cA = d['OPTION-A'].replace("A. ", "") - cB = d['OPTION-B'].replace("B. ", "") - choices = [cA, cB] - random.shuffle(choices) - prompt = UserEvaluatePrompt2Choices_en.format(story=d['STORY'], question=d['QUESTION'], choice_a=choices[0], choice_b=choices[1]) - map = {"A": "", "B": "", "C": "", "D": ""} - if choices[0] == cA: - map['A'] = 'A' - elif choices[0] == cB: - map['A'] = 'B' - - if choices[1] == cA: - map['B'] = 'A' - elif choices[1] == cB: - map['B'] = 'B' - - return map, prompt +from common import format_prompt_4, format_prompt_2, get_system_prompt, has_four_choices if __name__ == "__main__": @@ -146,12 +16,19 @@ def format_prompt_2(d, args): parser.add_argument("--try_times", type=int, default=5) parser.add_argument("--cot", type=bool, default=False) parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--trust_remote_code", type=bool, default=False, + help="Allow trusting remote code. Only enable for trusted model sources.") args = parser.parse_args() random.seed(args.seed) - tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=True).half().cuda() + # Security notice: trust_remote_code allows execution of arbitrary code from the model repository + # Only enable this for trusted model sources (e.g., HuggingFace hub models you trust) + if args.trust_remote_code: + print("WARNING: trust_remote_code=True - Only use this with trusted model sources!") + + tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=args.trust_remote_code) + model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=args.trust_remote_code).half().cuda() model_name = args.model_name.split("/")[-1] @@ -167,22 +44,13 @@ def format_prompt_2(d, args): print(file) for i, d in tqdm(enumerate(data[:10])): for j in range(args.try_times): - if d['选项C'] != None: + # Check if 4-choice options exist (supports both Chinese and English keys) + if has_four_choices(d): maps, prompt = format_prompt_4(d, args) else: maps, prompt = format_prompt_2(d, args) - - system_prompt = "" - if args.language == "zh": - if args.cot == False: - system_prompt = SystemEvaluatePrompt_zh - else: - system_prompt = SystemEvaluatePrompt_zh_cot - else: - if args.cot == False: - system_prompt = SystemEvaluatePrompt_en - else: - system_prompt = SystemEvaluatePrompt_en_cot + + system_prompt = get_system_prompt(args) messages = [ {"role": "system", "content": system_prompt},