Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions packages/prime/tests/test_env_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
"""Tests for prime env eval command with Prime Inference"""

import os
import subprocess
import tempfile

import pytest

# Use a small/fast model for testing.
# This identifier is passed via `-m` to `prime env eval` by the tests in this
# module that need a valid model.
TEST_MODEL = "deepseek/deepseek-chat"


@pytest.fixture(scope="module")
def install_math_env():
    """Install the single_turn_math environment for the tests in this module.

    Setup runs ``uv pip install single_turn_math`` with
    ``https://hub.primeintellect.ai/primeintellect/simple/`` as an extra
    index. If the install command exits non-zero, the dependent tests are
    skipped and the installer's stderr is used as the skip reason.

    Teardown (after the last test in the module) uninstalls the package
    again. The uninstall is best-effort: a failure is reported on stdout but
    never fails the test run.
    """
    install = subprocess.run(
        [
            "uv",
            "pip",
            "install",
            "single_turn_math",
            "--extra-index-url",
            "https://hub.primeintellect.ai/primeintellect/simple/",
        ],
        capture_output=True,
        text=True,
    )
    if install.returncode != 0:
        pytest.skip(f"Failed to install single_turn_math: {install.stderr}")

    yield

    # Cleanup: uninstall after tests.
    # NOTE: `uv pip uninstall` does not prompt for confirmation, so pip's `-y`
    # flag is intentionally not passed; an option uv does not recognise would
    # make the command exit with an error and leave the package installed.
    cleanup = subprocess.run(
        ["uv", "pip", "uninstall", "single_turn_math"],
        capture_output=True,
        text=True,
    )
    if cleanup.returncode != 0:
        # Best-effort cleanup: surface the problem without failing the run.
        print(f"Failed to uninstall single_turn_math: {cleanup.stderr}")


def test_env_eval_single_turn_math(install_math_env):
    """Run ``prime env eval`` end to end on the single_turn_math environment.

    A minimal evaluation (1 example, 1 rollout) is executed against Prime
    Inference; the test passes when the command exits with status 0.
    """
    command = [
        "uv",
        "run",
        "prime",
        "env",
        "eval",
        "single_turn_math",
        "-m",
        TEST_MODEL,
        "-n",
        "1",  # 1 example
        "-r",
        "1",  # 1 rollout
    ]

    # Child environment: the current environment plus an explicit
    # PRIME_API_KEY entry (empty string when the variable is not set).
    child_env = dict(os.environ)
    child_env["PRIME_API_KEY"] = os.environ.get("PRIME_API_KEY", "")

    # Run from a temp directory to avoid polluting the source tree with results
    with tempfile.TemporaryDirectory() as workdir:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=600,  # 10 minute timeout
            cwd=workdir,
            env=child_env,
        )

        # Echo both streams so they appear in pytest's captured output.
        print(f"stdout: {completed.stdout}")
        print(f"stderr: {completed.stderr}")

        assert completed.returncode == 0, f"Eval failed: {completed.stderr}\n{completed.stdout}"


def test_env_eval_invalid_model(install_math_env):
    """Check that ``prime env eval`` rejects a model name that does not exist.

    The command must exit non-zero and its combined output must mention
    either "invalid model" or "not found" (case-insensitive).
    """
    command = [
        "uv",
        "run",
        "prime",
        "env",
        "eval",
        "single_turn_math",
        "-m",
        "nonexistent/fake-model-12345",
        "-n",
        "1",
        "-r",
        "1",
    ]

    # Child environment: the current environment plus an explicit
    # PRIME_API_KEY entry (empty string when the variable is not set).
    child_env = dict(os.environ)
    child_env["PRIME_API_KEY"] = os.environ.get("PRIME_API_KEY", "")

    with tempfile.TemporaryDirectory() as workdir:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=120,
            cwd=workdir,
            env=child_env,
        )

        # Should fail with non-zero exit code
        assert completed.returncode != 0, f"Expected failure but got: {completed.stdout}"

        # Should show error about invalid model
        combined = completed.stdout.lower() + completed.stderr.lower()
        expected_markers = ("invalid model", "not found")
        assert any(marker in combined for marker in expected_markers)


def test_env_eval_missing_environment():
    """Check that ``prime env eval`` fails for an environment that is not installed.

    Only the exit status is checked: the command must exit non-zero.
    """
    command = [
        "uv",
        "run",
        "prime",
        "env",
        "eval",
        "nonexistent_env_xyz_12345",
        "-m",
        TEST_MODEL,
        "-n",
        "1",
        "-r",
        "1",
    ]

    # Child environment: the current environment plus an explicit
    # PRIME_API_KEY entry (empty string when the variable is not set).
    child_env = dict(os.environ)
    child_env["PRIME_API_KEY"] = os.environ.get("PRIME_API_KEY", "")

    with tempfile.TemporaryDirectory() as workdir:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=120,
            cwd=workdir,
            env=child_env,
        )

        # Should fail with non-zero exit code
        assert completed.returncode != 0, (
            f"Expected failure but got: {completed.stdout}\n{completed.stderr}"
        )