6 changes: 3 additions & 3 deletions applications/Chat/evaluate/gpt_evaluate.py
@@ -361,7 +361,7 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
     """
     Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer.
 
-    Temprature is set to 0 to make the model more deterministic.
+    Temperature is set to 0 to make the model more deterministic.
 
     Args:
         prompt: a dictionary including prompt template, CoT and metrics.
@@ -435,7 +435,7 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
     Use completion model(text-davinci-003) to evaluate one model answer.
     Only completion models can return log probabilities.
 
-    Temprature is set to 0 to make the model more deterministic.
+    Temperature is set to 0 to make the model more deterministic.
 
     Args:
         prompt: a dictionary including prompt template, CoT and metrics.
@@ -593,7 +593,7 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
 def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
     """
     Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
-    Different from text-davinci-003, this fuction directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
+    Different from text-davinci-003, this function directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
     Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.
 
     Args:
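The docstrings touched above describe the two scoring paths: the completion model (text-davinci-003) can return log probabilities, from which a probability-weighted score can be computed, while the chat models (gpt-3.5-turbo, gpt-4) only return plain text that has to be parsed for a score. A minimal sketch of both ideas follows; the helper names and the token filtering are illustrative assumptions, not the repository's calculate_scores_form_logprobs / calculate_scores_form_response implementations:

```python
import math
import re
from typing import Dict, List


def expected_score_from_logprobs(top_logprobs: Dict[str, float]) -> float:
    """Probability-weighted average over candidate score tokens (e.g. "1".."5").

    `top_logprobs` maps token strings to log probabilities, as a completion
    model returns per position when `logprobs` is requested.
    """
    weights: List[float] = []
    scores: List[float] = []
    for token, logprob in top_logprobs.items():
        token = token.strip()
        if token.isdigit():  # keep only numeric score tokens
            weights.append(math.exp(logprob))
            scores.append(float(token))
    if not weights:
        return 0.0
    return sum(w * s for w, s in zip(weights, scores)) / sum(weights)


def score_from_plain_response(response: str) -> int:
    """Fallback for chat models: parse the first integer found in the reply."""
    match = re.search(r"\d+", response)
    return int(match.group()) if match else 0
```

Setting temperature to 0 in both paths keeps the sampling greedy, so repeated evaluations of the same answer stay as reproducible as the API allows.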
4 changes: 2 additions & 2 deletions applications/Chat/evaluate/unieval/evaluator.py
@@ -277,7 +277,7 @@ def evaluate(self, data, category):
         n_data = len(data)
         eval_scores = [{} for _ in range(n_data)]
 
-        # Calculate average sentence-level scores for facutal consistency
+        # Calculate average sentence-level scores for factual consistency
         src_list, output_list = [], []
         n_sents = []  # the number of sentences in the claim
         for i in range(n_data):
@@ -288,7 +288,7 @@ def evaluate(self, data, category):
                 src_list.append(source)
                 output_list.append(system_outputs[j])
             input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
-            sent_score = self.scorer.score(input_list, self.task, category, dim)
+            sent_score = self.scorer.score(input_list, self.task, category, self.dim)
 
             # Get average score for each sample
             start_idx = 0
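The second hunk here is a genuine bug fix rather than a typo: `dim` is not defined in that scope, so the scorer must receive the evaluator's own `self.dim`. The surrounding code scores every (source, claim-sentence) pair in one flat batch and then folds the flat list of sentence scores back into one mean per sample using the recorded counts in `n_sents`. A small sketch of that fold, assuming `sent_score` is a flat list aligned with `n_sents` (illustrative helper, not the repository's code):

```python
from typing import List


def average_per_sample(sent_score: List[float], n_sents: List[int]) -> List[float]:
    """Collapse flat sentence-level scores into one mean score per sample."""
    sample_scores: List[float] = []
    start_idx = 0
    for count in n_sents:
        chunk = sent_score[start_idx:start_idx + count]
        sample_scores.append(sum(chunk) / count if count else 0.0)
        start_idx += count
    return sample_scores


# Two samples with 2 and 3 claim sentences -> roughly [0.8, 0.6]
print(average_per_sample([0.9, 0.7, 0.5, 0.6, 0.7], [2, 3]))
```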
2 changes: 1 addition & 1 deletion applications/Chat/evaluate/unieval/utils.py
@@ -37,7 +37,7 @@ def add_question(dimension, output, src=None, ref=None, context=None, task=None)
         src: source input for different NLG tasks. For example, source document for summarization
              and dialogue history for dialogue response generation.
         output: output text generated by the models
-        ref: human-annotataed groundtruth
+        ref: human-annotated groundtruth
         context: the context needed to evaluate several specific dimension. For example,
                  additional factual information when evaluating engagingness and groundedness in dialogues.
     """
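For context, `add_question` turns the raw fields documented above into dimension-specific evaluation inputs for the UniEval scorer. A hedged sketch of the general pattern; the function name `build_inputs` and the question wording are purely illustrative and do not reproduce UniEval's actual templates:

```python
from typing import List, Optional


def build_inputs(dimension: str,
                 output: List[str],
                 src: Optional[List[str]] = None) -> List[str]:
    """Hypothetical helper: pair each model output with a dimension question.

    The question text below is an illustrative placeholder, not UniEval's template.
    """
    if dimension == "consistency" and src is not None:
        return [
            f"question: Is this claim consistent with the document? "
            f"claim: {out} document: {doc}"
            for out, doc in zip(output, src)
        ]
    raise ValueError(f"Unsupported dimension for this sketch: {dimension}")
```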
4 changes: 2 additions & 2 deletions applications/Chat/tests/test_data.py
@@ -33,7 +33,7 @@ def gather_and_equal(tensor: torch.Tensor) -> bool:
 
 
 def run_test_data(strategy):
-    EXPERINCE_BATCH_SIZE = 4
+    EXPERIENCE_BATCH_SIZE = 4
     SAMPLE_BATCH_SIZE = 2
 
     if strategy == 'ddp':
@@ -54,7 +54,7 @@ def run_test_data(strategy):
 
     # experience of all ranks should be the same
    for _ in range(2):
-        data = get_data(EXPERINCE_BATCH_SIZE)
+        data = get_data(EXPERIENCE_BATCH_SIZE)
         assert gather_and_equal(data['input_ids'])
         assert gather_and_equal(data['attention_mask'])
         experience = experience_maker.make_experience(**data,
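The renamed constant only affects the test, which asserts that every rank sampled identical experience batches via `gather_and_equal`. A minimal sketch of how such a check can be written with torch.distributed, assuming an initialized default process group; the repository's implementation may differ in detail:

```python
import torch
import torch.distributed as dist


def gather_and_equal(tensor: torch.Tensor) -> bool:
    """All-gather a tensor and check that every rank produced the same values."""
    world_size = dist.get_world_size()
    gathered = [torch.empty_like(tensor) for _ in range(world_size)]
    dist.all_gather(gathered, tensor)
    return all(torch.equal(gathered[0], other) for other in gathered[1:])
```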