diff --git a/src/openbench/evals/gpqa_diamond.py b/src/openbench/evals/gpqa_diamond.py index 35731080..ab197120 100644 --- a/src/openbench/evals/gpqa_diamond.py +++ b/src/openbench/evals/gpqa_diamond.py @@ -9,20 +9,23 @@ # There is one difference between this and the original gpqa simple eval - the prompts are not reshuffled for every epoch. Shouldn't be that big of a deal, but worth noting. def record_to_mcq_sample(record: dict) -> MCQSample: """Convert a GQPQA Diamond record to an openbench MCQSample.""" - random.seed(0) + question = record["Question"] options = [ record["Correct Answer"], record["Incorrect Answer 1"], record["Incorrect Answer 2"], record["Incorrect Answer 3"], ] - random.shuffle(options) + # Seed a per-question random state from hash(question) so the option order varies across samples + # NOTE: str hashes are salted per process, so the order only repeats across runs when PYTHONHASHSEED is fixed + rng = random.Random(hash(question) % (2**32)) + rng.shuffle(options) # Get index of correct answer and convert to A, B, C, D correct_index = options.index(record["Correct Answer"]) correct_letter = "ABCD"[correct_index] return MCQSample( input=MULTIPLE_CHOICE_PROMPT_TEMPLATE.format( - prompt=record["Question"], + prompt=question, option_a=options[0], option_b=options[1], option_c=options[2], diff --git a/uv.lock b/uv.lock index 1ac87d42..29e7f1e9 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.13'", @@ -2541,7 +2541,7 @@ wheels = [ [[package]] name = "openbench" -version = "0.5.2" +version = "0.5.3" source = { editable = "." } dependencies = [ { name = "anthropic" },