diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..726fb45
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+claude.md
diff --git a/common.py b/common.py
new file mode 100644
index 0000000..4459ba7
--- /dev/null
+++ b/common.py
@@ -0,0 +1,131 @@
+"""
+Common utility functions for ToMBench evaluation scripts.
+This module contains shared functions used by both API-based and HuggingFace evaluation.
+"""
+import random
+
+from prompts import (
+    SystemEvaluatePrompt_en,
+    SystemEvaluatePrompt_en_cot,
+    SystemEvaluatePrompt_zh,
+    SystemEvaluatePrompt_zh_cot,
+    UserEvaluatePrompt2Choices_en,
+    UserEvaluatePrompt2Choices_zh,
+    UserEvaluatePrompt4Choices_en,
+    UserEvaluatePrompt4Choices_zh,
+)
+
+# Answer letters in prompt order; every returned mapping has all four as keys.
+_LETTERS = ("A", "B", "C", "D")
+
+# Record keys and user-prompt templates (indexed by number of choices) per language.
+_ZH = {
+    "option": "选项{}",
+    "story": "故事",
+    "question": "问题",
+    "templates": {2: UserEvaluatePrompt2Choices_zh, 4: UserEvaluatePrompt4Choices_zh},
+}
+_EN = {
+    "option": "OPTION-{}",
+    "story": "STORY",
+    "question": "QUESTION",
+    "templates": {2: UserEvaluatePrompt2Choices_en, 4: UserEvaluatePrompt4Choices_en},
+}
+
+
+def _format_prompt(d, args, n_choices):
+    """
+    Shuffle the first `n_choices` options of `d` and render the user prompt.
+
+    Args:
+        d: Data dictionary containing the story, question and options
+        args: Arguments containing language setting ('zh' selects the Chinese
+            keys and template; any other value selects the English ones)
+        n_choices: Number of options to present (2 or 4)
+
+    Returns:
+        tuple: (mapping_dict, formatted_prompt). mapping_dict maps each letter
+        shown in the prompt to the option's original letter; letters that are
+        not shown map to "".
+    """
+    spec = _ZH if args.language == 'zh' else _EN
+
+    # Keep each option text paired with its original letter so the mapping is
+    # tracked through the shuffle; comparing texts afterwards would mislabel
+    # options whose texts are identical.
+    labelled = []
+    for letter in _LETTERS[:n_choices]:
+        # count=1 removes only the first "X. " label and leaves any later
+        # occurrence of the same characters inside the option text intact.
+        text = d[spec["option"].format(letter)].replace(letter + ". ", "", 1)
+        labelled.append((letter, text))
+
+    # random.shuffle permutes by position, so shuffling these pairs draws from
+    # the RNG exactly as shuffling the bare texts did for a given seed.
+    random.shuffle(labelled)
+
+    mapping = dict.fromkeys(_LETTERS, "")
+    fields = {"story": d[spec["story"]], "question": d[spec["question"]]}
+    for shown, (original, text) in zip(_LETTERS, labelled):
+        mapping[shown] = original
+        fields["choice_" + shown.lower()] = text
+
+    return mapping, spec["templates"][n_choices].format(**fields)
+
+
+def format_prompt_4(d, args):
+    """
+    Format prompt for 4-choice questions.
+
+    Args:
+        d: Data dictionary containing the question and options
+        args: Arguments containing language setting
+
+    Returns:
+        tuple: (mapping_dict, formatted_prompt)
+    """
+    return _format_prompt(d, args, 4)
+
+
+def format_prompt_2(d, args):
+    """
+    Format prompt for 2-choice questions.
+
+    Args:
+        d: Data dictionary containing the question and options
+        args: Arguments containing language setting
+
+    Returns:
+        tuple: (mapping_dict, formatted_prompt); the 'C' and 'D' entries of
+        mapping_dict are always "".
+    """
+    return _format_prompt(d, args, 2)
+
+
+def get_system_prompt(args):
+    """
+    Get the appropriate system prompt based on language and CoT settings.
+
+    Args:
+        args: Arguments containing language and cot settings
+
+    Returns:
+        The system prompt constant selected from the prompts module
+    """
+    if args.language == "zh":
+        return SystemEvaluatePrompt_zh_cot if args.cot else SystemEvaluatePrompt_zh
+    return SystemEvaluatePrompt_en_cot if args.cot else SystemEvaluatePrompt_en
+
+
+def has_four_choices(d):
+    """
+    Check if the data entry has four answer options.
+
+    Args:
+        d: Data dictionary containing the question and options
+
+    Returns:
+        bool: True if a non-empty third option exists under either the Chinese
+        or the English key, False otherwise (2-choice)
+    """
+    return bool(d.get('选项C') or d.get('OPTION-C'))
diff --git a/eval_huggingface.sh b/eval_huggingface.sh
index fe41f52..374aa83 100644
--- a/eval_huggingface.sh
+++ b/eval_huggingface.sh
@@ -1,9 +1,7 @@
 python3 run_huggingface.py \
     --task "" \
     --model_name "" \
-    --api_base "" \
-    --api_key "" \
     --language "zh" \
     --cot True \
     --try_times 5 \
-    --output_dir ./results \
\ No newline at end of file
+    --output_path ./results \
\ No newline at end of file
diff --git a/get_results.py b/get_results.py
index 53a1a1e..a28b4bf 100644
--- a/get_results.py
+++ b/get_results.py
@@ -3,6 +3,17 @@
 import os
 
 def most_common_element(lst):
+    """
+    Find the most common element in a list.
+
+    Args:
+        lst: List of elements
+
+    Returns:
+        The most common element, or None if list is empty
+    """
+    if not lst:
+        return None
     element_freq = {}
     for item in lst:
         element_freq[item] = element_freq.get(item, 0) + 1
@@ -11,6 +22,17 @@ def most_common_element(lst):
 
 
 def extract_answer(text):
+    """
+    Extract answer from model output.
+
+    Args:
+        text: Model output text
+
+    Returns:
+        Extracted answer (A, B, C, or D)
+    """
+    if not text:
+        return "A"
     if "[[A]]" in text:
         return "A"
     elif "[[B]]" in text:
@@ -45,39 +67,54 @@ def extract_answer(text):
     parser.add_argument("--input_path", type=str, default="")
     parser.add_argument("--try_times", type=int, default=5)
     args = parser.parse_args()
-    
+
     files = os.listdir("./results")
     acc_per_task = {}
     cnt_per_task = {}
 
     acc_per_ability = {}
     cnt_per_ability = {}
-    
+
     for file in files:
         with open(f"./results/{file}", "r", encoding='utf-8') as f:
             data = [json.loads(line) for line in f.readlines()]
-        
-        answers = ["" for _ in range(len(data) // args.try_times)]
-        preds = [[] for _ in range(len(data) // args.try_times)]
-        abilities = ["" for _ in range(len(data) // args.try_times)]
+
+        # Find max index to properly size arrays
+        max_idx = max(d.get('idx', 0) for d in data) + 1 if data else 0
+
+        answers = [""] * max_idx
+        preds = [[] for _ in range(max_idx)]
+        abilities = [""] * max_idx
+
         for d in data:
-            preds[d['idx']].append(d['map'][extract_answer(d['output'])])
-            if answers[d['idx']] == "":
-                answers[d['idx']] = d['answer']
-            
-            if abilities[d['idx']] == "":
-                abilities[d['idx']] = d['data']['能力\nABILITY']
-        
-
-        for i in range(len(data) // args.try_times):
+            idx = d.get('idx', 0)
+            extracted = extract_answer(d.get('output', ''))
+            # Safely get mapped answer, default to extracted if not in map
+            mapped = d.get('map', {}).get(extracted, extracted)
+            if mapped:  # Only append non-empty values
+                preds[idx].append(mapped)
+            if answers[idx] == "":
+                answers[idx] = d.get('answer', '')
+
+            if abilities[idx] == "":
+                abilities[idx] = d.get('data', {}).get('能力\nABILITY', '')
+
+        # Count valid samples
+        valid_samples = sum(1 for i in range(max_idx) if answers[i])
+
+        for i in range(max_idx):
+            if not answers[i]:
+                continue  # Skip empty samples
+
             task = file.split("_")[0]
-            ability = abilities[i]
+            ability = abilities[i] if abilities[i] else "Unknown"
 
             cnt_per_task[task] = cnt_per_task.get(task, 0) + 1
             cnt_per_ability[ability] = cnt_per_ability.get(ability, 0) + 1
 
-
-            if answers[i] == most_common_element(preds[i]):
+            # Get most common prediction
+            most_common = most_common_element(preds[i])
+            if most_common and answers[i] == most_common:
                 acc_per_task[task] = acc_per_task.get(task, 0) + 1
                 acc_per_ability[ability] = acc_per_ability.get(ability, 0) + 1
     
diff --git a/run_api.py b/run_api.py
index 6318a10..9e9b47f 100644
--- a/run_api.py
+++ b/run_api.py
@@ -1,20 +1,28 @@
 import json
 import random
 import time
+import logging
 from tqdm import tqdm
 import multiprocessing
 from concurrent.futures import ThreadPoolExecutor
-import openai
+from openai import OpenAI
 import os
 import argparse
 from prompts import *
+from common import format_prompt_4, format_prompt_2, get_system_prompt, has_four_choices
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
 
 
 class ChatGPTProcessor:
-    def __init__(self):
+    def __init__(self, api_base="", api_key=""):
         self.lock = multiprocessing.Lock()
-        openai.api_key = ""
-        openai.api_base = ""
+        # Explicit arguments win over environment variables; an empty base URL becomes None so the client's default endpoint applies
+        api_key = api_key or os.environ.get("OPENAI_API_KEY", "")
+        api_base = api_base or os.environ.get("OPENAI_API_BASE") or None
+        self.client = OpenAI(api_key=api_key, base_url=api_base)
 
     def read_jsonl(self, input_file):
         with open(input_file, 'r', encoding='utf-8') as f:
@@ -22,159 +30,70 @@ def read_jsonl(self, input_file):
         return list(map(json.loads, tqdm(lines, desc='Reading...')))
 
     def write_to_json(self, data, file_path):
+        """
+        Append `data` as one JSON line to `file_path` while holding the processor lock.
+
+        Args:
+            data: Dictionary to serialize (written with ensure_ascii=False)
+            file_path: Path to the output file; it is opened in append mode
+        """
         with self.lock:
+            # Create the parent directory when the path has one; a bare
+            # file name has dirname "" and os.makedirs("") raises.
+            os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
             with open(file_path, 'a', encoding='utf-8') as file:
                 json.dump(data, file, ensure_ascii=False)
                 file.write('\n')
 
-    def multiple_gpt(self, payload):
-        while True:
+    def multiple_gpt(self, payload, max_retries=5):
+        """
+        Send one chat request, retrying on errors, and write the result (or an error record) via write_to_json.
+
+        Args:
+            payload: Request dict; reads 'model', 'messages', 'answer', 'idx', 'number' and 'save_path'
+            max_retries: Maximum number of failed attempts before an error record is written (default: 5)
+
+        Raises:
+            openai.AuthenticationError: re-raised immediately, since retrying cannot fix bad credentials
+        """
+        import openai  # local import: this module imports only the OpenAI class, not the package itself
+        retries = 0
+        while retries < max_retries:
             try:
-                chat_completion = openai.ChatCompletion.create(model=payload['model'], temperature=0, messages=payload['messages'])
+                chat_completion = self.client.chat.completions.create(model=payload['model'], temperature=0, messages=payload['messages'])
                 data = payload.copy()
                 data['messages'] = payload['messages']
                 data['answer'] = payload['answer']
                 data['output'] = chat_completion.choices[0].message.content
                 break
+            except openai.AuthenticationError as e:  # must come before APIError, its base class
+                logger.error(f"Authentication error: {e}. Please check your API key.")
+                raise
+            except openai.RateLimitError as e:
+                retries += 1
+                wait_time = random.randint(2, 5) * retries
+                logger.warning(f"Rate limit error (attempt {retries}/{max_retries}): {e}. Waiting {wait_time}s...")
+                time.sleep(wait_time)
+            except openai.APIError as e:
+                retries += 1
+                wait_time = random.randint(1, 3)
+                logger.warning(f"API error (attempt {retries}/{max_retries}): {e}. Waiting {wait_time}s...")
+                time.sleep(wait_time)
             except Exception as e:
+                retries += 1
+                logger.error(f"Unexpected error (attempt {retries}/{max_retries}): {e}")
                 time.sleep(random.randint(1, 3))
+        else:
+            logger.error(f"Max retries ({max_retries}) exceeded for payload idx={payload['idx']}, number={payload['number']}")
+            # Write error result to maintain data continuity
+            data = payload.copy()
+            data['output'] = "ERROR: Max retries exceeded"
 
         self.write_to_json(data, payload['save_path'])
         time.sleep(random.randint(1, 3))
 
 
 
-def format_prompt_4(d, args):
-    if args.language == 'zh':
-        cA = d['选项A'].replace("A. ", "")
-        cB = d['选项B'].replace("B. ", "")
-        cC = d['选项C'].replace("C. ", "")
-        cD = d['选项D'].replace("D. ", "")
-        choices = [cA, cB, cC, cD]
-        random.shuffle(choices)
-        prompt = UserEvaluatePrompt4Choices_zh.format(story=d['故事'], question=d['问题'], choice_a=choices[0], choice_b=choices[1], choice_c=choices[2], choice_d=choices[3])
-        map = {"A": "", "B": "", "C": "", "D": ""}
-
-        if choices[0] == cA:
-            map['A'] = 'A'
-        elif choices[0] == cB:
-            map['A'] = 'B'
-        elif choices[0] == cC:
-            map['A'] = 'C'
-        elif choices[0] == cD:
-            map['A'] = 'D'
-        
-        if choices[1] == cA:
-            map['B'] = 'A'
-        elif choices[1] == cB:
-            map['B'] = 'B'
-        elif choices[1] == cC:
-            map['B'] = 'C'
-        elif choices[1] == cD:
-            map['B'] = 'D'
-
-        if choices[2] == cA:
-            map['C'] = 'A'
-        elif choices[2] == cB:
-            map['C'] = 'B'
-        elif choices[2] == cC:
-            map['C'] = 'C'
-        elif choices[2] == cD:
-            map['C'] = 'D'
-        
-        if choices[3] == cA:
-            map['D'] = 'A'
-        elif choices[3] == cB:
-            map['D'] = 'B'
-        elif choices[3] == cC:
-            map['D'] = 'C'
-        elif choices[3] == cD:
-            map['D'] = 'D'
-    else:
-        cA = d['OPTION-A'].replace("A. ", "")
-        cB = d['OPTION-B'].replace("B. ", "")
-        cC = d['OPTION-C'].replace("C. ", "")
-        cD = d['OPTION-D'].replace("D. ", "")
-        choices = [cA, cB, cC, cD]
-        random.shuffle(choices)
-        prompt = UserEvaluatePrompt4Choices_en.format(story=d['STORY'], question=d['QUESTION'], choice_a=choices[0], choice_b=choices[1], choice_c=choices[2], choice_d=choices[3])
-        map = {"A": "", "B": "", "C": "", "D": ""}
-
-        if choices[0] == cA:
-            map['A'] = 'A'
-        elif choices[0] == cB:
-            map['A'] = 'B'
-        elif choices[0] == cC:
-            map['A'] = 'C'
-        elif choices[0] == cD:
-            map['A'] = 'D'
-        
-        if choices[1] == cA:
-            map['B'] = 'A'
-        elif choices[1] == cB:
-            map['B'] = 'B'
-        elif choices[1] == cC:
-            map['B'] = 'C'
-        elif choices[1] == cD:
-            map['B'] = 'D'
-
-        if choices[2] == cA:
-            map['C'] = 'A'
-        elif choices[2] == cB:
-            map['C'] = 'B'
-        elif choices[2] == cC:
-            map['C'] = 'C'
-        elif choices[2] == cD:
-            map['C'] = 'D'
-        
-        if choices[3] == cA:
-            map['D'] = 'A'
-        elif choices[3] == cB:
-            map['D'] = 'B'
-        elif choices[3] == cC:
-            map['D'] = 'C'
-        elif choices[3] == cD:
-            map['D'] = 'D'
-    return map, prompt
-
-
-def format_prompt_2(d, args):
-    if args.language == 'zh':
-        cA = d['选项A'].replace("A. ", "")
-        cB = d['选项B'].replace("B. ", "")
-        choices = [cA, cB]
-        random.shuffle(choices)
-        prompt = UserEvaluatePrompt2Choices_zh.format(story=d['故事'], question=d['问题'], choice_a=choices[0], choice_b=choices[1])
-        map = {"A": "", "B": "", "C": "", "D": ""}
-        if choices[0] == cA:
-            map['A'] = 'A'
-        elif choices[0] == cB:
-            map['A'] = 'B'
-        
-        if choices[1] == cA:
-            map['B'] = 'A'
-        elif choices[1] == cB:
-            map['B'] = 'B'
-    else:
-        cA = d['OPTION-A'].replace("A. ", "")
-        cB = d['OPTION-B'].replace("B. ", "")
-        choices = [cA, cB]
-        random.shuffle(choices)
-        prompt = UserEvaluatePrompt2Choices_en.format(story=d['STORY'], question=d['QUESTION'], choice_a=choices[0], choice_b=choices[1])
-        map = {"A": "", "B": "", "C": "", "D": ""}
-        if choices[0] == cA:
-            map['A'] = 'A'
-        elif choices[0] == cB:
-            map['A'] = 'B'
-        
-        if choices[1] == cA:
-            map['B'] = 'A'
-        elif choices[1] == cB:
-            map['B'] = 'B'
-
-    return map, prompt
-
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--task", type=str, default="")
@@ -189,7 +108,7 @@ def format_prompt_2(d, args):
     args = parser.parse_args()
 
     random.seed(args.seed)
-    processor = ChatGPTProcessor()
+    processor = ChatGPTProcessor(api_base=args.api_base, api_key=args.api_key)
 
     files = os.listdir("./data")
     if args.task != "":
@@ -200,28 +119,20 @@ def format_prompt_2(d, args):
         try:
             with open(f"data/{file}", "r", encoding='utf-8') as f:
                 data = [json.loads(line) for line in f.readlines()]
-        except:
+        except (FileNotFoundError, json.JSONDecodeError, IOError) as e:
+            print(f"Error reading {file}: {e}")
             continue
         payloads = []
         for i, d in enumerate(data):
             for j in range(args.try_times):
-                if d['选项C'] != None:
+                # Check if 4-choice options exist (supports both Chinese and English keys)
+                if has_four_choices(d):
                     maps, prompt = format_prompt_4(d, args)
                 else:
                     maps, prompt = format_prompt_2(d, args)
-                
-                system_prompt = ""
-                if args.language == "zh":
-                    if args.cot == False:
-                        system_prompt = SystemEvaluatePrompt_zh
-                    else:
-                        system_prompt = SystemEvaluatePrompt_zh_cot
-                else:
-                    if args.cot == False:
-                        system_prompt = SystemEvaluatePrompt_en
-                    else:
-                        system_prompt = SystemEvaluatePrompt_en_cot
-                
+
+                system_prompt = get_system_prompt(args)
+
                 payload = {
                     "model": args.model_name,
                     "stream": False,
@@ -240,7 +151,10 @@ def format_prompt_2(d, args):
                 }
                 payloads.append(payload)
 
-                
+
+        # Execute API calls in parallel and wait for completion
         with ThreadPoolExecutor(max_workers=32) as executor:
-            for payload in payloads:
-                executor.submit(processor.multiple_gpt, payload)
+            futures = [executor.submit(processor.multiple_gpt, payload) for payload in payloads]
+            # Wait for all tasks to complete
+            for future in futures:
+                future.result()
diff --git a/run_huggingface.py b/run_huggingface.py
index ac4e9bf..c1a99e0 100644
--- a/run_huggingface.py
+++ b/run_huggingface.py
@@ -5,137 +5,7 @@
 from prompts import *
 from tqdm import tqdm
 import os
-
-
-def format_prompt_4(d, args):
-    if args.language == 'zh':
-        cA = d['选项A'].replace("A. ", "")
-        cB = d['选项B'].replace("B. ", "")
-        cC = d['选项C'].replace("C. ", "")
-        cD = d['选项D'].replace("D. ", "")
-        choices = [cA, cB, cC, cD]
-        random.shuffle(choices)
-        prompt = UserEvaluatePrompt4Choices_zh.format(story=d['故事'], question=d['问题'], choice_a=choices[0], choice_b=choices[1], choice_c=choices[2], choice_d=choices[3])
-        map = {"A": "", "B": "", "C": "", "D": ""}
-
-        if choices[0] == cA:
-            map['A'] = 'A'
-        elif choices[0] == cB:
-            map['A'] = 'B'
-        elif choices[0] == cC:
-            map['A'] = 'C'
-        elif choices[0] == cD:
-            map['A'] = 'D'
-        
-        if choices[1] == cA:
-            map['B'] = 'A'
-        elif choices[1] == cB:
-            map['B'] = 'B'
-        elif choices[1] == cC:
-            map['B'] = 'C'
-        elif choices[1] == cD:
-            map['B'] = 'D'
-
-        if choices[2] == cA:
-            map['C'] = 'A'
-        elif choices[2] == cB:
-            map['C'] = 'B'
-        elif choices[2] == cC:
-            map['C'] = 'C'
-        elif choices[2] == cD:
-            map['C'] = 'D'
-        
-        if choices[3] == cA:
-            map['D'] = 'A'
-        elif choices[3] == cB:
-            map['D'] = 'B'
-        elif choices[3] == cC:
-            map['D'] = 'C'
-        elif choices[3] == cD:
-            map['D'] = 'D'
-    else:
-        cA = d['OPTION-A'].replace("A. ", "")
-        cB = d['OPTION-B'].replace("B. ", "")
-        cC = d['OPTION-C'].replace("C. ", "")
-        cD = d['OPTION-D'].replace("D. ", "")
-        choices = [cA, cB, cC, cD]
-        random.shuffle(choices)
-        prompt = UserEvaluatePrompt4Choices_en.format(story=d['STORY'], question=d['QUESTION'], choice_a=choices[0], choice_b=choices[1], choice_c=choices[2], choice_d=choices[3])
-        map = {"A": "", "B": "", "C": "", "D": ""}
-
-        if choices[0] == cA:
-            map['A'] = 'A'
-        elif choices[0] == cB:
-            map['A'] = 'B'
-        elif choices[0] == cC:
-            map['A'] = 'C'
-        elif choices[0] == cD:
-            map['A'] = 'D'
-        
-        if choices[1] == cA:
-            map['B'] = 'A'
-        elif choices[1] == cB:
-            map['B'] = 'B'
-        elif choices[1] == cC:
-            map['B'] = 'C'
-        elif choices[1] == cD:
-            map['B'] = 'D'
-
-        if choices[2] == cA:
-            map['C'] = 'A'
-        elif choices[2] == cB:
-            map['C'] = 'B'
-        elif choices[2] == cC:
-            map['C'] = 'C'
-        elif choices[2] == cD:
-            map['C'] = 'D'
-        
-        if choices[3] == cA:
-            map['D'] = 'A'
-        elif choices[3] == cB:
-            map['D'] = 'B'
-        elif choices[3] == cC:
-            map['D'] = 'C'
-        elif choices[3] == cD:
-            map['D'] = 'D'
-    return map, prompt
-
-
-def format_prompt_2(d, args):
-    if args.language == 'zh':
-        cA = d['选项A'].replace("A. ", "")
-        cB = d['选项B'].replace("B. ", "")
-        choices = [cA, cB]
-        random.shuffle(choices)
-        prompt = UserEvaluatePrompt2Choices_zh.format(story=d['故事'], question=d['问题'], choice_a=choices[0], choice_b=choices[1])
-        map = {"A": "", "B": "", "C": "", "D": ""}
-        if choices[0] == cA:
-            map['A'] = 'A'
-        elif choices[0] == cB:
-            map['A'] = 'B'
-        
-        if choices[1] == cA:
-            map['B'] = 'A'
-        elif choices[1] == cB:
-            map['B'] = 'B'
-    else:
-        cA = d['OPTION-A'].replace("A. ", "")
-        cB = d['OPTION-B'].replace("B. ", "")
-        choices = [cA, cB]
-        random.shuffle(choices)
-        prompt = UserEvaluatePrompt2Choices_en.format(story=d['STORY'], question=d['QUESTION'], choice_a=choices[0], choice_b=choices[1])
-        map = {"A": "", "B": "", "C": "", "D": ""}
-        if choices[0] == cA:
-            map['A'] = 'A'
-        elif choices[0] == cB:
-            map['A'] = 'B'
-        
-        if choices[1] == cA:
-            map['B'] = 'A'
-        elif choices[1] == cB:
-            map['B'] = 'B'
-
-    return map, prompt
+from common import format_prompt_4, format_prompt_2, get_system_prompt, has_four_choices
 
 
 if __name__ == "__main__":
@@ -146,12 +16,19 @@ def format_prompt_2(d, args):
     parser.add_argument("--try_times", type=int, default=5)
     parser.add_argument("--cot", type=bool, default=False)
     parser.add_argument("--seed", type=int, default=42)
+    # argparse's type=bool treats any non-empty string (even "False") as True, so parse the text explicitly.
+    parser.add_argument("--trust_remote_code", type=lambda v: v.lower() in ("1", "true", "yes"), default=False, help="Allow trusting remote code. Only enable for trusted model sources.")
     args = parser.parse_args()
 
     random.seed(args.seed)
 
-    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=True).half().cuda()
+    # Security notice: trust_remote_code allows execution of arbitrary code from the model repository
+    # Only enable this for trusted model sources (e.g., HuggingFace hub models you trust)
+    if args.trust_remote_code:
+        print("WARNING: trust_remote_code=True - Only use this with trusted model sources!")
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=args.trust_remote_code)
+    model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=args.trust_remote_code).half().cuda()
 
     model_name = args.model_name.split("/")[-1]
     
@@ -167,22 +44,13 @@ def format_prompt_2(d, args):
         print(file)
         for i, d in tqdm(enumerate(data[:10])):
             for j in range(args.try_times):
-                if d['选项C'] != None:
+                # Check if 4-choice options exist (supports both Chinese and English keys)
+                if has_four_choices(d):
                     maps, prompt = format_prompt_4(d, args)
                 else:
                     maps, prompt = format_prompt_2(d, args)
-                
-                system_prompt = ""
-                if args.language == "zh":
-                    if args.cot == False:
-                        system_prompt = SystemEvaluatePrompt_zh
-                    else:
-                        system_prompt = SystemEvaluatePrompt_zh_cot
-                else:
-                    if args.cot == False:
-                        system_prompt = SystemEvaluatePrompt_en
-                    else:
-                        system_prompt = SystemEvaluatePrompt_en_cot
+
+                system_prompt = get_system_prompt(args)
 
                 messages = [
                     {"role": "system", "content": system_prompt},