diff --git a/applications/Chat/evaluate/gpt_evaluate.py b/applications/Chat/evaluate/gpt_evaluate.py
new file mode 100644
index 000000000000..119054299517
--- /dev/null
+++ b/applications/Chat/evaluate/gpt_evaluate.py
@@ -0,0 +1,530 @@
+import concurrent.futures
+import os
+import random
+import re
+import time
+from copy import deepcopy
+from typing import Any, Dict, List
+
+import matplotlib.pyplot as plt
+import numpy as np
+import openai
+import pandas as pd
+import seaborn as sns
+import tqdm
+from utils import jdump, jload
+
+
+def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
+    """
+    Ask GPT-4 to judge one pair of answers, retrying on errors.
+
+    Args:
+        sys_prompt: content of the "system" chat message.
+        user_prompt: content of the "user" chat message.
+        id: id of the answer pair; echoed back unchanged in the result.
+        max_tokens: upper bound on the number of tokens GPT-4 may generate.
+
+    Returns:
+        A dict with the keys "evaluation" (the reply text, or "" when every
+        attempt failed) and "id".
+    """
+
+    MAX_API_RETRY = 3
+    messages = [
+        {"role": "system", "content": sys_prompt},
+        {"role": "user", "content": user_prompt},
+    ]
+
+    attempt = 0
+    while attempt < MAX_API_RETRY:
+        attempt += 1
+        try:
+            response = openai.ChatCompletion.create(
+                model="gpt-4",
+                messages=messages,
+                temperature=0.2,
+                max_tokens=max_tokens,
+            )
+            reply = response["choices"][0]["message"]["content"]
+            return {"evaluation": reply, "id": id}
+        except Exception as e:
+            # Any failure (API error or malformed response) costs one attempt.
+            print(e)
+            time.sleep(1)
+    print(f" Evaluation {id} failed after {MAX_API_RETRY} retries.")
+    return {"evaluation": "", "id": id}
+
+
+def parse_battle_score(evaluation: str) -> List[float]:
+    """
+    Parse evaluation from GPT-4 and get the scores of model 1 and 2.
+
+    The phrasings "X out of 10", "a score of X" and "X/10" are tried in that
+    order, and a phrasing is used only if it occurs exactly twice. Otherwise
+    the first line must hold the two scores, separated by spaces and/or commas.
+
+    Args:
+        evaluation: evaluation from GPT-4.
+
+    Returns:
+        A score pair of two different model answers, or [-1, -1] if parsing fails.
+    """
+
+    # One score in [0, 10], optionally with decimals. "10" is tried first so it
+    # is not cut down to "1", and the look-arounds stop a match from starting
+    # or ending in the middle of a longer number such as "85" or "8.5".
+    number = r"(?<![\d.])(10(?:\.0+)?|[0-9](?:\.[0-9]+)?)(?!\d)"
+    templates = ("{} out of 10", "a score of {}", "{}/10")
+
+    try:
+        for template in templates:
+            found = re.findall(template.format(number), evaluation)
+            if len(found) == 2:
+                return [float(found[0]), float(found[1])]
+
+        # split() without arguments collapses repeated whitespace, so "8, 9"
+        # (comma followed by a space) is read as a pair too.
+        tokens = evaluation.split("\n")[0].replace(",", " ").split()
+        if len(tokens) == 2:
+            return [float(tokens[0]), float(tokens[1])]
+    except (TypeError, ValueError):
+        # Non-string input or non-numeric tokens: report an invalid pair.
+        return [-1, -1]
+    return [-1, -1]
+
+
+def battle(answer1: List[Dict], answer2: List[Dict], prompt_dict: Dict[str, Any]) -> List[Dict]:
+    """
+    Use GPT-4 to compare answers of two different models.
+
+    Args:
+        answer1: answers of model 1. Each item is read for the keys "id",
+            "instruction", "input" and "output".
+        answer2: answers of model 2, in the same order as answer1. Each item
+            is read for the keys "id" and "output".
+        prompt_dict: prompt for battle, holding "system_prompt", "prompt" and
+            a "prompt_template" with the fields {question}, {answer_1},
+            {answer_2} and {prompt}.
+
+    Returns:
+        Evaluations of all comparison pairs, sorted by id. Each item is the
+        dict returned by get_battle_result.
+
+    Raises:
+        AssertionError: if the two lists differ in length or if the ids of a
+            pair do not match.
+    """
+
+    assert len(answer1) == len(answer2), "The two answer lists must have the same length."
+
+    print(f" Total number of answers: {len(answer1)}.")
+
+    sys_prompt = prompt_dict["system_prompt"]
+    prompt_template = prompt_dict["prompt_template"]
+
+    evaluations = []
+    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+        futures = []
+        for ans1, ans2 in zip(answer1, answer2):
+            assert ans1["id"] == ans2["id"], f'Mismatched ids: {ans1["id"]} vs {ans2["id"]}.'
+
+            # The question is the instruction, followed by the input if any.
+            question = ans1["instruction"]
+            if ans1["input"] != "":
+                question = question + " " + ans1["input"]
+
+            prompt = prompt_template.format(
+                question=question,
+                answer_1=ans1["output"],
+                answer_2=ans2["output"],
+                prompt=prompt_dict["prompt"],
+            )
+            futures.append(executor.submit(get_battle_result, sys_prompt, prompt, ans1["id"], 2048))
+
+        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
+            evaluations.append(future.result())
+
+    evaluations.sort(key=lambda x: x["id"])
+
+    return evaluations
+
+
+def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_path: str) -> None:
+    """
+    Save evaluation results (model 1 vs model 2) from GPT-4.
+
+    Each evaluation is scored with parse_battle_score and grouped, from model 2's
+    point of view, as better, worse, tie or invalid (no score pair was parsed).
+    The groups go to <save_path>/<name1>_vs_<name2>/ and a summary is merged
+    into <save_path>/battle_results.json.
+
+    Args:
+        evaluations: evaluation results from GPT-4.
+        name1: model 1 's name.
+        name2: model 2 's name.
+        save_path: path to save battle results.
+    """
+
+    evaluation_file = deepcopy(evaluations)
+
+    ans1_score = ans2_score = 0
+    better_file, worse_file, tie_file, invalid_file = [], [], [], []
+
+    for scored in evaluation_file:
+        scores = parse_battle_score(scored["evaluation"])
+        scored["score"] = scores
+
+        if scores[0] == -1 and scores[1] == -1:
+            invalid_file.append(scored)
+            print(f'Invalid score pair: {scored["id"]}.')
+            continue
+
+        if scores[0] > scores[1]:
+            worse_file.append(scored)
+        elif scores[0] < scores[1]:
+            better_file.append(scored)
+        else:
+            tie_file.append(scored)
+        ans1_score += scores[0]
+        ans2_score += scores[1]
+
+    better_count = len(better_file)
+    worse_count = len(worse_file)
+    tie_count = len(tie_file)
+    invalid_count = len(invalid_file)
+
+    # Rates and averages only cover pairs with a valid score. Without any valid
+    # pair they are reported as 0 instead of raising ZeroDivisionError.
+    valid_count = len(evaluations) - invalid_count
+    win_rate = better_count / valid_count if valid_count > 0 else 0.0
+    avg_score1 = ans1_score / valid_count if valid_count > 0 else 0.0
+    avg_score2 = ans2_score / valid_count if valid_count > 0 else 0.0
+
+    prefix = f"{name1}_vs_{name2}"
+
+    # Also creates save_path itself when it is missing.
+    os.makedirs(os.path.join(save_path, prefix), exist_ok=True)
+
+    jdump(better_file, os.path.join(save_path, prefix, f"{name2}_better.json"))
+    jdump(worse_file, os.path.join(save_path, prefix, f"{name2}_worse.json"))
+    jdump(tie_file, os.path.join(save_path, prefix, f"{prefix}_tie.json"))
+    jdump(invalid_file, os.path.join(save_path, prefix, f"{prefix}_invalid.json"))
+    jdump(evaluation_file, os.path.join(save_path, prefix, f"{prefix}_evaluations.json"))
+
+    results_path = os.path.join(save_path, "battle_results.json")
+    results = jload(results_path) if os.path.exists(results_path) else {}
+
+    results[prefix] = {
+        "model": [name1, name2],
+        "better": better_count,
+        "worse": worse_count,
+        "tie": tie_count,
+        "win_rate": win_rate,
+        "score": [avg_score1, avg_score2],
+    }
+    jdump(results, results_path)
+
+    print(f"Total {invalid_count} invalid score pair(s).")
+    print(f"Model {name2} has {better_count} better answer(s).")
+    print(f"Model {name2} has {worse_count} worse answer(s).")
+    print(f"{tie_count} answer(s) play(s) to a tie.")
+    print(f"Win rate of model {name2}: {win_rate:.2f}")
+    print(f"Model {name1} average score: {avg_score1:.2f}")
+    print(f"Model {name2} average score: {avg_score2:.2f}")
+
+
+def get_gpt35_evaluation(prompt: Dict[str, Any], inst: Dict[str, Any], max_tokens: int = 2048) -> Dict[str, Any]:
+    """
+    Use GPT-3.5 to evaluate one model answer.
+
+    Args:
+        prompt: a dictionary including prompt template, CoT and metrics.
+        inst: the instruction that is needed to be evaluated; its "evaluation" key is overwritten in place.
+        max_tokens: the maximum number of tokens to generate in the completion.
+
+    Returns:
+        inst, whose "evaluation" holds one entry per metric that was evaluated successfully.
+    """
+
+    MAX_API_RETRY = 3
+    question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"])
+    answer = inst["output"]
+    inst["evaluation"] = {}
+
+    for metric, description in prompt["metrics"].items():
+        for _ in range(MAX_API_RETRY):
+            try:
+                response = openai.Completion.create(
+                    model="text-davinci-003",
+                    prompt=prompt["prompt"].format(
+                        question=question,
+                        answer=answer,
+                        metric=description,
+                        steps=prompt["CoT"][metric],
+                    ),
+                    logprobs=5,
+                    temperature=0,
+                    max_tokens=max_tokens,
+                )
+                inst["evaluation"][metric] = {
+                    "response": response["choices"][0]["text"],
+                    "logprobs": response["choices"][0]["logprobs"]["top_logprobs"],
+                }
+                break
+            except Exception as e:
+                print(e)
+                time.sleep(1)
+        else:    # no break: every attempt for this metric failed, so it gets no entry
+            print(f' Evaluation {inst.get("id")} failed on metric {metric} after {MAX_API_RETRY} retries.')
+    return inst
+
+
+def gpt35_evaluate(
+    answers: List[Dict],
+    prompts: List[Dict],
+    model_name: str,
+    save_path: str,
+) -> List[Dict]:
+    """
+    Use GPT-3.5 to evaluate model answers and save evaluation results.
+
+    Answers are grouped by category and each group is evaluated with the prompt
+    of its category, falling back to the prompt of category "general". The
+    results of one category are saved as
+    <save_path>/<model_name>/<category>_evaluation_results.json and all results
+    together as <save_path>/<model_name>_evaluation_results.json.
+
+    Args:
+        answers: model answers.
+        prompts: prompts for all categories.
+        model_name: name of the model.
+        save_path: path to save GPT-3.5 evaluations.
+
+    Returns:
+        All the evaluations of the given answers.
+
+    Raises:
+        KeyError: if a category has no prompt of its own and prompts has no category "general".
+    """
+
+    prompt_per_category = {prompt["category"]: prompt for prompt in prompts}
+
+    data_per_category = {}
+    for answer in answers:
+        data_per_category.setdefault(answer["category"], []).append(answer)
+
+    categories_str = ", ".join(data_per_category)
+    print(f"The evaluated categories are {categories_str}.")
+
+    # Create the per-model directory (and save_path with it) before writing.
+    model_dir = os.path.join(save_path, model_name)
+    os.makedirs(model_dir, exist_ok=True)
+
+    all_evaluations = []
+
+    for category, data in data_per_category.items():
+        if prompt_per_category.get(category) is None:
+            print(f"No metrics for category {category}! Use category general now.")
+            prompt_per_category[category] = prompt_per_category["general"]
+        category_prompt = prompt_per_category[category]
+
+        category_file = os.path.join(model_dir, f"{category}_evaluation_results.json")
+        if os.path.exists(category_file):
+            print(
+                f"The evaluation file for category {category} already exists. You are now re-evaluating category {category}!"
+            )
+
+        print(f"The number of instances of category {category}'s is {len(data)}.")
+        metrics_str = ", ".join(category_prompt["metrics"])
+        print(f"Category {category}'s metrics are {metrics_str}.")
+
+        evaluations = []
+        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+            # max_tokens=1: ask for a single generated token per request.
+            futures = [executor.submit(get_gpt35_evaluation, category_prompt, inst, 1) for inst in data]
+
+            for future in tqdm.tqdm(
+                    concurrent.futures.as_completed(futures),
+                    desc=f"{category}: ",
+                    total=len(futures),
+            ):
+                evaluations.append(future.result())
+
+        evaluations.sort(key=lambda x: x["id"])
+
+        jdump(evaluations, category_file)
+        print(f"{category} done.")
+
+        all_evaluations.extend(evaluations)
+
+    jdump(
+        all_evaluations,
+        os.path.join(save_path, f"{model_name}_evaluation_results.json"),
+    )
+
+    return all_evaluations
+
+
+def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
+    """
+    Calculate score from log probabilities returned by text-davinci-003.
+    Only openai.Completion can return logprobs.
+
+    Calculation formula:
+        score = sum(score_i * exp(value)) where score_i is the score which corresponds to the key(predicted token) and value is its log probability.
+
+    Ref: https://arxiv.org/abs/2303.16634
+    This paper proposes NLG evaluation methods using GPT-3.5(logprobs returned by openai api) and GPT-4(logprobs obtained by sampling).
+
+    Args:
+        logprobs: logprobs returned by openai.Completion.
+
+    Returns:
+        Score of one answer.
+    """
+
+    # GPT-3.5 only returns score of 1 to 5.
+    prob = np.zeros(5)
+
+    for key, value in logprobs.items():
+        # Sometimes the key will be one byte of a unicode character which takes the form of "bytes:\\xe7".
+        # It is meaningless and thus we don't calculate probability.
+        if "bytes" in key:
+            continue
+        # A token counts only if it holds exactly one digit and that digit is a valid score (1 to 5).
+        # A "0" would otherwise wrap around to prob[-1] (score 5) and "6" to "9" would raise IndexError.
+        digits = re.findall(r"\d", key)
+        if len(digits) == 1 and 1 <= int(digits[0]) <= 5:
+            prob[int(digits[0]) - 1] += np.exp(value)
+
+    score = np.dot(np.arange(1, 6), prob)
+
+    return score
+
+
+def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
+    """
+    Generate statistics for one model.
+
+    For every category and metric, the average score and the ids and scores of
+    the 3 best and 3 worst answers are written to
+    <save_path>/<model_name>_evaluation_statistics.json. An answer without a
+    result for a metric is left out of the statistics of that metric.
+
+    Args:
+        model_name: name of the model for saving statistics.
+        evaluations: evaluations for all of the model answers.
+        save_path: path to save GPT-3.5 evaluation statistics.
+    """
+
+    os.makedirs(save_path, exist_ok=True)
+
+    data_per_category = {}
+    for evaluation in evaluations:
+        data_per_category.setdefault(evaluation["category"], []).append(evaluation)
+
+    all_statistics = {}
+    for category, data in data_per_category.items():
+        # Metrics in order of first appearance; an answer need not have a result for each of them.
+        metrics = list(dict.fromkeys(metric for evaluation in data for metric in evaluation["evaluation"]))
+
+        statistics = {}
+        for metric in metrics:
+            scored = [evaluation for evaluation in data if metric in evaluation["evaluation"]]
+            scores = [
+                calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0])
+                for evaluation in scored
+            ]
+            arg_sort = np.argsort(scores)
+            statistics[metric] = {
+                "avg_score": sum(scores) / len(scores),
+                "best_3": {scored[i]["id"]: scores[i] for i in arg_sort[-3:][::-1]},
+                "worst_3": {scored[i]["id"]: scores[i] for i in arg_sort[:3]},
+            }
+
+        all_statistics[category] = statistics
+
+    jdump(all_statistics, os.path.join(save_path, f"{model_name}_evaluation_statistics.json"))
+
+
+def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> None:
+    """
+    Analyze and visualize all GPT-3.5 evaluation statistics in the given directory.
+
+    Every file in statistics_path whose name ends with "_evaluation_statistics.json"
+    is loaded. A table of all statistics is saved as gpt35_evaluation_statistics.csv
+    and one bar chart of the average scores is saved per category as <category>.png.
+
+    Args:
+        statistics_path: path to all the models' statistics.
+        save_path: path to save table and visualization results.
+
+    Raises:
+        Exception: if statistics_path doesn't exist or holds no statistics file.
+    """
+
+    if not os.path.exists(statistics_path):
+        raise Exception("The given directory doesn't exist! No statistics found!")
+
+    all_statistics = {}
+
+    for file_name in os.listdir(statistics_path):
+        if file_name.endswith("_evaluation_statistics.json"):
+            model_name = file_name.split("_evaluation_statistics.json")[0]
+            all_statistics[model_name] = jload(os.path.join(statistics_path, file_name))
+
+    if not all_statistics:
+        raise Exception("There are no statistics in the given directory!")
+
+    frame_all = {
+        "model": [],
+        "category": [],
+        "metric": [],
+        "avg_score": [],
+        "best_3": [],
+        "worst_3": [],
+    }
+    frame_per_category = {}
+    for model_name, model_statistics in all_statistics.items():
+        for category, category_statistics in model_statistics.items():
+            # One column-wise table per category, shared by all models.
+            category_frame = frame_per_category.setdefault(
+                category, {"model": [], "metric": [], "avg_score": [], "best_3": [], "worst_3": []})
+
+            for metric, metric_statistics in category_statistics.items():
+                frame_all["model"].append(model_name)
+                frame_all["category"].append(category)
+                frame_all["metric"].append(metric)
+                frame_all["avg_score"].append(metric_statistics["avg_score"])
+                frame_all["best_3"].append(metric_statistics["best_3"])
+                frame_all["worst_3"].append(metric_statistics["worst_3"])
+
+                category_frame["model"].append(model_name)
+                category_frame["metric"].append(metric)
+                category_frame["avg_score"].append(metric_statistics["avg_score"])
+                category_frame["best_3"].append(metric_statistics["best_3"])
+                category_frame["worst_3"].append(metric_statistics["worst_3"])
+
+    os.makedirs(save_path, exist_ok=True)
+
+    frame_all = pd.DataFrame(frame_all)
+    frame_all.to_csv(os.path.join(save_path, "gpt35_evaluation_statistics.csv"))
+
+    progress = tqdm.tqdm(frame_per_category.items(), desc="category: ", total=len(frame_per_category))
+    for category, category_frame in progress:
+        data = pd.DataFrame(category_frame)
+
+        sns.set()
+        figure = plt.figure(figsize=(16, 10))
+        plt.ylim((0, 5))
+
+        axes = sns.barplot(x="metric", y="avg_score", hue="model", data=data, dodge=True)
+        axes.set_title(f"Comparison between Different Models for Category {category.title()}")
+        plt.xlabel("Evaluation Metric")
+        plt.ylabel("Average Score")
+
+        figure.savefig(os.path.join(save_path, f"{category}.png"), dpi=400)
+        # pyplot keeps every figure it creates alive until it is closed, so close
+        # this one before moving on to the next category.
+        plt.close(figure)