diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py
index 6fac151b..34c10f57 100644
--- a/scripts/eval_from_generations.py
+++ b/scripts/eval_from_generations.py
@@ -845,9 +845,8 @@ def main(config: EvalConfig):
     batch_eval(total_work, config, dataset, run_dir, eval_file_path)
 
-    # Calculate pass@k metrics if multiple samples per problem were evaluated
-    if config.num_samples_per_problem > 1:
-        calculate_pass_at_k(eval_file_path, config.pass_at_k_values)
+    # Calculate pass@k metrics
+    calculate_pass_at_k(eval_file_path, config.pass_at_k_values)
 
 
 def calc_pass_at_k(n, c, k):
diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py
index fce1b16f..2df0fac1 100644
--- a/scripts/generate_and_eval_single_sample.py
+++ b/scripts/generate_and_eval_single_sample.py
@@ -55,6 +55,8 @@ def __init__(self):
         self.model_name = REQUIRED
         self.max_tokens = None
         self.temperature = None
+        self.server_address = None
+        self.server_port = None
 
         # Reasoning model specific parameters
         self.is_reasoning_model = False  # set to True for o1, o3, Gemini 2.5 thinking, etc.
@@ -154,6 +156,8 @@ def main(config: EvalConfig):
         is_reasoning_model=config.is_reasoning_model,
         reasoning_effort=config.reasoning_effort,
         budget_tokens=config.budget_tokens,
+        server_address=config.server_address,
+        server_port=config.server_port,
     )
 
     # Prompt Construction (Note: could be shortened in future PR)
diff --git a/scripts/generate_samples.py b/scripts/generate_samples.py
index 2c01ee8d..fce14c3a 100644
--- a/scripts/generate_samples.py
+++ b/scripts/generate_samples.py
@@ -57,6 +57,8 @@ def __init__(self):
         self.model_name = None
         self.max_tokens = None
         self.temperature = 0.0
+        self.server_address = None
+        self.server_port = None
 
         # Reasoning model specific parameters
         self.is_reasoning_model = False  # set to True for o1, o3, Gemini 2.5 thinking, etc.
@@ -153,13 +155,13 @@ def generate_sample_single(
         # uses the default set of forbidden and warning patterns,
         # you could adapt the patterns to your own setting (degree of banning cuda stream, allowing some torch ops)
     )
-    assert static_check_status, f"Static check failed for sample {work.sample_id} for problem {problem_number}: {problem_name}. Error: {error}. Warnings: {warnings}"
+    assert static_check_status, f"Static check failed for sample {work.sample_id} for problem {work.problem_id}: {problem_name}. Error: {error}. Warnings: {warnings}"
     if warnings:
-        print(f"Static check warnings for sample {work.sample_id} for problem {problem_number}: {problem_name}. Warnings: {warnings}")
+        print(f"Static check warnings for sample {work.sample_id} for problem {work.problem_id}: {problem_name}. Warnings: {warnings}")
 
     if config.verbose:
         print(
-            f"Generated sample {work.sample_id} for problem {problem_number}: {problem_name}"
+            f"Generated sample {work.sample_id} for problem {work.problem_id}: {problem_name}"
         )
 
     # Store to local file
@@ -321,6 +323,8 @@ def main(config: GenerationConfig):
         is_reasoning_model=config.is_reasoning_model,
         reasoning_effort=config.reasoning_effort,
         budget_tokens=config.budget_tokens,
+        server_address=config.server_address,
+        server_port=config.server_port,
     )
 
     # Launch workers
diff --git a/src/kernelbench/eval.py b/src/kernelbench/eval.py
index dd79b2c0..102e321a 100644
--- a/src/kernelbench/eval.py
+++ b/src/kernelbench/eval.py
@@ -518,8 +518,12 @@ def eval_kernel_against_ref(
             print(
                 f"[Eval] Lock file error during compilation, Please retry. Error: {e}"
             )
+            metadata["compilation_error_name"] = "LockFileError"
+            metadata["compilation_error"] = f"Lock file error during concurrent compilation: {e}"
             graceful_eval_cleanup(context, device, tempfile)
-            return None
+            return KernelExecResult(
+                compiled=False, correctness=False, metadata=metadata
+            )
         else:
             metadata["compilation_error_name"] = get_error_name(e)
             metadata["compilation_error"] = e
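
Note on the first hunk: `calculate_pass_at_k` now runs unconditionally, so pass@k is reported even when only one sample per problem was evaluated. The `calc_pass_at_k(n, c, k)` helper visible as context presumably implements the standard unbiased pass@k estimator from Chen et al. (2021), which is well-defined for any `k <= n`; a minimal sketch under that assumption:

```python
import numpy as np

def calc_pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: probability that at least one of k samples drawn
    without replacement from n generations is correct, given c correct ones.
    (Chen et al., 2021, "Evaluating Large Language Models Trained on Code")"""
    if n - c < k:
        return 1.0  # fewer than k incorrect samples, so any draw of k includes a correct one
    # 1 - C(n-c, k) / C(n, k), computed as a running product for numerical stability
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))
```

For `n == k == 1` this reduces to `c / n`, i.e. plain pass@1, which is why dropping the `num_samples_per_problem > 1` guard is safe.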