133 changes: 133 additions & 0 deletions applications/Chat/evaluate/config/config_cn.json
@@ -0,0 +1,133 @@
{
"language": "cn",
"category": {
"brainstorming": {
"GPT-3.5": [
"language organization",
"relevance",
"creativity",
"practicality",
"correctness"
],
"GPT-4": [],
"Metrics": [
"Distinct"
]
},
"chat": {
"GPT-3.5": [
"language organization",
"relevance",
"naturalness",
"engagingness",
"reasonableness"
],
"GPT-4": [],
"Metrics": [
"Distinct"
]
},
"classification": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"GPT-4": [],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"closed_qa": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"GPT-4": [],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"extraction": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"GPT-4": [],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"generation": {
"GPT-3.5": [
"language organization",
"relevance",
"diversity"
],
"GPT-4": [],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"open_qa": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"GPT-4": [],
"Metrics": [
"Distinct"
]
},
"rewriting": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"GPT-4": [],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"roleplay": {
"GPT-3.5": [
"language organization",
"relevance",
"fidelity",
"creativity"
],
"GPT-4": [],
"Metrics": [
"Distinct"
]
},
"summarization": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness",
"conciseness"
],
"GPT-4": [],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
}
}
}
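The config above maps each task category to the dimensions scored by a GPT-3.5 judge and the automatic metrics computed locally. A minimal sketch of how such a config could be consumed (the `get_eval_plan` helper is hypothetical, not part of the repository, and only one of the ten categories is reproduced here):

```python
import json

# Excerpt of config_cn.json; the full file defines ten categories.
CONFIG = json.loads("""
{
  "language": "cn",
  "category": {
    "closed_qa": {
      "GPT-3.5": ["language organization", "relevance", "correctness"],
      "GPT-4": [],
      "Metrics": ["BLEU", "ROUGE", "BERTScore"]
    }
  }
}
""")


def get_eval_plan(config, category):
    """Return (GPT-judged dimensions, automatic metrics) for a category."""
    entry = config["category"][category]
    return entry["GPT-3.5"], entry["Metrics"]


dims, metrics = get_eval_plan(CONFIG, "closed_qa")
print(dims)     # ['language organization', 'relevance', 'correctness']
print(metrics)  # ['BLEU', 'ROUGE', 'BERTScore']
```

Note that the empty `"GPT-4"` lists leave room for a separate set of dimensions when GPT-4 is used as the judge; reference-based metrics (BLEU, ROUGE, BERTScore) appear only for categories with ground-truth answers, while open-ended categories fall back to Distinct.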
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id": 1,
"system_prompt": "你是一个检查回答质量的好助手。",
"prompt_template": "[问题]\n{question}\n\n[1号AI助手的答案]\n{answer_1}\n\n[1号AI助手答案终止]\n\n[2号AI助手的答案]\n{answer_2}\n\n[2号AI助手答案终止]\n\n[要求]\n{prompt}\n\n",
"prompt": "我们需要你评价这两个AI助手回答的性能。\n请对他们的回答的有用性、相关性、准确性、详细程度进行评分。每个AI助手都会得到一个1到10分的总分,分数越高表示整体表现越好。\n请首先输出一行,该行只包含两个数值,分别表示1号和2号AI助手的分数。这两个分数之间要有一个空格。在随后的一行中,请对你的评价作出全面的解释,避免任何潜在的偏见,并确保AI助手回答的顺序不会影响您的判断。"
}
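This second file defines the pairwise-comparison ("battle") prompt sent to the GPT judge. In English, the `system_prompt` reads roughly "You are a helpful assistant that checks answer quality," and the `prompt` asks the judge to score the helpfulness, relevance, accuracy, and level of detail of both assistants' answers on a 1–10 scale, output the two scores on one line separated by a space, then give a full explanation while avoiding bias from the order of the answers. A sketch of how the template might be filled (variable names and the sample question are illustrative, not the repository's actual code):

```python
# Hypothetical assembly of the judge prompt; English gloss of the fields:
#   system_prompt: "You are a helpful assistant that checks answer quality."
#   prompt: rate helpfulness/relevance/accuracy/detail of both answers,
#           1-10 each, scores first on one line, then an unbiased explanation.
battle_prompt = {
    "system_prompt": "你是一个检查回答质量的好助手。",
    "prompt_template": (
        "[问题]\n{question}\n\n"
        "[1号AI助手的答案]\n{answer_1}\n\n[1号AI助手答案终止]\n\n"
        "[2号AI助手的答案]\n{answer_2}\n\n[2号AI助手答案终止]\n\n"
        "[要求]\n{prompt}\n\n"
    ),
    "prompt": "我们需要你评价这两个AI助手回答的性能。",
}

# Fill the template with a sample question and two candidate answers.
user_message = battle_prompt["prompt_template"].format(
    question="1+1等于几?",
    answer_1="2",
    answer_2="等于3",
    prompt=battle_prompt["prompt"],
)
print(user_message)
```

Asking for the two scores on the first line, alone, makes the judge's verdict trivially machine-parseable, while the free-form explanation on the following lines remains available for auditing.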