diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index e7a73ed..4c76143 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -23,10 +23,33 @@
 make dev-frontend   # start Vite dev server (port 5173) with HMR
 make dev-bundle     # build UI, serve full bundled experience at port 8001 via uv run
 ```
-Standard development uses `dev-backend` + `dev-frontend` in separate terminals. The Vite dev server proxies nothing — the frontend calls the backend at `http://localhost:8001` directly via CORS.
+Standard development uses `dev-backend` + `dev-frontend` in separate terminals. The Vite dev server proxies nothing; the frontend calls the backend at `http://localhost:8001` directly via CORS.
 
 `dev-bundle` is useful for testing the bundled UI experience without building a wheel. It copies `ui/dist` into the source tree temporarily and cleans up when the server exits.
 
+### Postgres backend (optional, for `/api/runs`)
+
+The default in-memory backend keeps `make dev-backend` zero-config. To exercise the async run pipeline locally, bring up a Postgres instance alongside the app:
+
+```bash
+make pg-up           # start postgres:17-alpine in a docker container (port 5432, ephemeral via --rm)
+make migrate         # apply the agentevals schema
+make dev-backend-pg  # pg-up + migrate + serve --dev with backend=postgres wired up
+make pg-down         # stop the container; data is discarded with --rm
+```
+
+Override the defaults via `PG_PORT=5433 make pg-up` etc. The `migrate` target is idempotent (a second invocation is a no-op).
+
+Once running, submit a run with:
+
+```bash
+curl -X POST http://localhost:8001/api/runs \
+  -H 'content-type: application/json' \
+  -d '{"spec": {"approach": "trace_replay", "target": {"kind": "inline", "inline": {...}}, "evalConfig": {"metrics": ["tool_trajectory_avg_score"]}}}'
+```
+
+Then poll `GET /api/runs/{runId}` and `GET /api/runs/{runId}/results`. Without `AGENTEVALS_STORAGE_BACKEND=postgres`, the `/api/runs` endpoints return 503 with a hint pointing at the env var.
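+
+The same flow from Python, as a rough polling sketch (assumes `httpx` is installed; the camelCase keys and lowercase status strings mirror how the new runs router serializes its models, but treat the exact field names here as illustrative):
+
+```python
+import time
+
+import httpx
+
+BASE = "http://localhost:8001/api"
+
+spec = {
+    "approach": "trace_replay",
+    "target": {"kind": "inline", "inline": {}},  # put a real inline trace payload here
+    "evalConfig": {"metrics": ["tool_trajectory_avg_score"]},
+}
+
+resp = httpx.post(f"{BASE}/runs", json={"spec": spec})
+resp.raise_for_status()
+run = resp.json()["data"]
+
+# poll until the run reaches a terminal state
+while run["status"] in ("queued", "running"):
+    time.sleep(1)
+    run = httpx.get(f"{BASE}/runs/{run['runId']}").json()["data"]
+
+print(run["status"])
+print(httpx.get(f"{BASE}/runs/{run['runId']}/results").json()["data"])
+```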
+ ### Building ```bash diff --git a/Dockerfile b/Dockerfile index f1eb0d8..d43d63c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,7 @@ COPY src ./src COPY --from=ui /build/ui/dist ./src/agentevals/_static -RUN uv sync --frozen --no-dev --extra live \ +RUN uv sync --frozen --no-dev --extra live --extra postgres \ && groupadd --gid 1000 app \ && useradd --uid 1000 --gid app --home-dir /app --no-log-init app \ && chown -R app:app /app diff --git a/Makefile b/Makefile index cee2922..32147f9 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,14 @@ HELM_CHART_DIR ?= charts/agentevals HELM_CHART_OCI_URL ?= $(HELM_REPO)/helm HELM_CHART_VERSION ?= $(VERSION) -.PHONY: build build-bundle build-docker build-ui release clean dev-backend dev-frontend dev-bundle test test-unit test-integration test-e2e helm-lint helm-template helm-test helm-cleanup helm-package helm-publish +.PHONY: build build-bundle build-docker build-ui release clean dev-backend dev-backend-pg dev-frontend dev-bundle pg-up pg-down migrate test test-unit test-integration test-e2e helm-lint helm-template helm-test helm-cleanup helm-package helm-publish + +PG_CONTAINER ?= agentevals-pg +PG_PORT ?= 5432 +PG_USER ?= agentevals +PG_PASSWORD ?= agentevals +PG_DATABASE ?= agentevals +PG_DSN ?= postgresql://$(PG_USER):$(PG_PASSWORD)@localhost:$(PG_PORT)/$(PG_DATABASE) build: uv build @@ -53,6 +60,30 @@ release: clean build-ui dev-backend: uv run agentevals serve --dev +pg-up: + @if [ -z "$$(docker ps -q -f name=^/$(PG_CONTAINER)$$)" ]; then \ + docker run -d --rm --name $(PG_CONTAINER) \ + -e POSTGRES_USER=$(PG_USER) \ + -e POSTGRES_PASSWORD=$(PG_PASSWORD) \ + -e POSTGRES_DB=$(PG_DATABASE) \ + -p $(PG_PORT):5432 postgres:17-alpine; \ + else \ + echo "container $(PG_CONTAINER) already running"; \ + fi + @until docker exec $(PG_CONTAINER) pg_isready -U $(PG_USER) >/dev/null 2>&1; do sleep 1; done + @echo "Postgres ready at $(PG_DSN)" + +pg-down: + -docker stop $(PG_CONTAINER) + +migrate: + AGENTEVALS_DATABASE_URL=$(PG_DSN) uv run agentevals migrate up + +dev-backend-pg: pg-up migrate + AGENTEVALS_STORAGE_BACKEND=postgres \ + AGENTEVALS_DATABASE_URL=$(PG_DSN) \ + uv run agentevals serve --dev + dev-frontend: cd ui && npm run dev diff --git a/README.md b/README.md index c25b278..52b2e91 100644 --- a/README.md +++ b/README.md @@ -286,6 +286,24 @@ The source for the chart lives in [`charts/agentevals/`](charts/agentevals/) if See the [Kubernetes example](examples/kubernetes/README.md) for an end-to-end walkthrough deploying agentevals alongside kagent and an OTel Collector on Kubernetes. +#### Postgres backend (`/api/runs`) + +By default the chart deploys agentevals with an in-memory backend; runs and results are not persisted. To enable the async `POST /api/runs` pipeline with durable Postgres-backed state: + +```bash +# Bundled Postgres (dev / evaluation only): +helm install agentevals oci://ghcr.io/agentevals-dev/agentevals/helm/agentevals \ + --set storage.backend=postgres \ + --set database.postgres.bundled.enabled=true + +# Or supply an external Postgres DSN: +helm install agentevals oci://ghcr.io/agentevals-dev/agentevals/helm/agentevals \ + --set storage.backend=postgres \ + --set database.postgres.url='postgresql://user:pass@host:5432/dbname' +``` + +When `storage.backend=postgres` the app applies any pending schema migrations on startup (advisory-lock protected, safe across replicas) and starts an in-process worker that processes the run queue. 
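+
+Conceptually, that startup guard is just a Postgres advisory lock held around the migration pass. A minimal sketch of the idea (assumes `asyncpg`; the actual lock key and bookkeeping are internal to the app's migrator):
+
+```python
+import asyncpg
+
+LOCK_KEY = 0x41452121  # illustrative constant, not the real key
+
+async def migrate_on_startup(dsn: str) -> None:
+    conn = await asyncpg.connect(dsn)
+    try:
+        # Only one replica proceeds past this line; the rest block until the
+        # winner finishes applying pending migrations and releases the lock.
+        await conn.execute("SELECT pg_advisory_lock($1)", LOCK_KEY)
+        # ... compare schema_migrations against the bundled *.up.sql files
+        # and apply anything pending ...
+    finally:
+        await conn.execute("SELECT pg_advisory_unlock($1)", LOCK_KEY)
+        await conn.close()
+```
+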
Without `storage.backend=postgres` the `/api/runs` endpoints return 503 with a hint pointing at the env var. + ## MCP Server Exposes evaluation tools to MCP clients. A `.mcp.json` at the project root lets Claude Code pick it up automatically. diff --git a/charts/agentevals/templates/_helpers.tpl b/charts/agentevals/templates/_helpers.tpl index 13f3cc6..6672e29 100644 --- a/charts/agentevals/templates/_helpers.tpl +++ b/charts/agentevals/templates/_helpers.tpl @@ -48,6 +48,17 @@ app.kubernetes.io/name: {{ include "agentevals.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} +{{- /* +Selector labels scoped to the main app Pod and its Service. Carries the +``app.kubernetes.io/component: agentevals`` discriminator so the agentevals +Service does not also match the bundled Postgres Pod (which carries +``app.kubernetes.io/component: database`` instead). +*/ -}} +{{- define "agentevals.app.selectorLabels" -}} +{{ include "agentevals.selectorLabels" . }} +app.kubernetes.io/component: agentevals +{{- end }} + {{- define "agentevals.serviceAccountName" -}} {{- if .Values.serviceAccount.create }} {{- default (include "agentevals.fullname" .) .Values.serviceAccount.name }} @@ -55,3 +66,25 @@ app.kubernetes.io/instance: {{ .Release.Name }} {{- default "default" .Values.serviceAccount.name }} {{- end }} {{- end }} + +{{/* +Service name for the bundled Postgres instance. +*/}} +{{- define "agentevals.postgresqlServiceName" -}} +{{- printf "%s-postgresql" (include "agentevals.fullname" .) -}} +{{- end -}} + +{{/* +Bundled Postgres image reference (registry/repository/name:tag). +*/}} +{{- define "agentevals.postgresql.image" -}} +{{- $pg := .Values.database.postgres.bundled -}} +{{- printf "%s/%s/%s:%s" $pg.image.registry $pg.image.repository $pg.image.name $pg.image.tag -}} +{{- end -}} + +{{/* +Secret name holding POSTGRES_PASSWORD for the bundled Postgres instance. +*/}} +{{- define "agentevals.passwordSecretName" -}} +{{- printf "%s-postgresql" (include "agentevals.fullname" .) -}} +{{- end -}} diff --git a/charts/agentevals/templates/deployment.yaml b/charts/agentevals/templates/deployment.yaml index 3a56b25..e8852dc 100644 --- a/charts/agentevals/templates/deployment.yaml +++ b/charts/agentevals/templates/deployment.yaml @@ -9,7 +9,7 @@ spec: replicas: {{ .Values.replicaCount }} selector: matchLabels: - {{- include "agentevals.selectorLabels" . | nindent 6 }} + {{- include "agentevals.app.selectorLabels" . | nindent 6 }} template: metadata: {{- with .Values.podAnnotations }} @@ -17,7 +17,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} labels: - {{- include "agentevals.selectorLabels" . | nindent 8 }} + {{- include "agentevals.app.selectorLabels" . | nindent 8 }} {{- with .Values.podLabels }} {{- toYaml . | nindent 8 }} {{- end }} @@ -65,6 +65,29 @@ spec: - name: HOME value: "/tmp/agentevals-home" {{- end }} + {{- if eq .Values.storage.backend "postgres" }} + - name: AGENTEVALS_STORAGE_BACKEND + value: "postgres" + - name: AGENTEVALS_DATABASE_SCHEMA + value: {{ .Values.database.postgres.schema | quote }} + {{- if .Values.database.postgres.urlFile }} + - name: AGENTEVALS_DATABASE_URL_FILE + value: {{ .Values.database.postgres.urlFile | quote }} + {{- else if .Values.database.postgres.url }} + - name: AGENTEVALS_DATABASE_URL + value: {{ .Values.database.postgres.url | quote }} + {{- else if .Values.database.postgres.bundled.enabled }} + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "agentevals.passwordSecretName" . 
}} + key: POSTGRES_PASSWORD + - name: AGENTEVALS_DATABASE_URL + value: {{ printf "postgresql://agentevals:$(POSTGRES_PASSWORD)@%s.%s.svc.cluster.local:5432/agentevals?sslmode=disable" (include "agentevals.postgresqlServiceName" .) (include "agentevals.namespace" .) | quote }} + {{- else }} + {{ fail "storage.backend=postgres requires database.postgres.url, database.postgres.urlFile, or database.postgres.bundled.enabled=true" }} + {{- end }} + {{- end }} {{- with .Values.env }} {{- toYaml . | nindent 12 }} {{- end }} diff --git a/charts/agentevals/templates/postgresql-secret.yaml b/charts/agentevals/templates/postgresql-secret.yaml new file mode 100644 index 0000000..21daab6 --- /dev/null +++ b/charts/agentevals/templates/postgresql-secret.yaml @@ -0,0 +1,13 @@ +{{- if and (eq .Values.storage.backend "postgres") .Values.database.postgres.bundled.enabled (not .Values.database.postgres.url) (not .Values.database.postgres.urlFile) }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "agentevals.passwordSecretName" . }} + namespace: {{ include "agentevals.namespace" . }} + labels: + {{- include "agentevals.labels" . | nindent 4 }} + app.kubernetes.io/component: database +type: Opaque +data: + POSTGRES_PASSWORD: {{ "agentevals" | b64enc | quote }} +{{- end }} diff --git a/charts/agentevals/templates/postgresql.yaml b/charts/agentevals/templates/postgresql.yaml new file mode 100644 index 0000000..c4e5370 --- /dev/null +++ b/charts/agentevals/templates/postgresql.yaml @@ -0,0 +1,142 @@ +{{- if and (eq .Values.storage.backend "postgres") .Values.database.postgres.bundled.enabled (not .Values.database.postgres.url) (not .Values.database.postgres.urlFile) }} +{{- $pg := .Values.database.postgres.bundled }} +{{- $fullname := include "agentevals.postgresqlServiceName" . }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ $fullname }} + namespace: {{ include "agentevals.namespace" . }} + labels: + {{- include "agentevals.labels" . | nindent 4 }} + app.kubernetes.io/component: database +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ $fullname }} + namespace: {{ include "agentevals.namespace" . }} + labels: + {{- include "agentevals.labels" . | nindent 4 }} + app.kubernetes.io/component: database +spec: + accessModes: + - ReadWriteOnce + {{- if $pg.storageClassName }} + storageClassName: {{ $pg.storageClassName | quote }} + {{- end }} + resources: + requests: + storage: {{ $pg.storage | quote }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ $fullname }} + namespace: {{ include "agentevals.namespace" . }} + labels: + {{- include "agentevals.labels" . | nindent 4 }} + app.kubernetes.io/component: database +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + {{- include "agentevals.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: database + template: + metadata: + labels: + {{- include "agentevals.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: database + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ $fullname }} + securityContext: + fsGroup: 999 + runAsUser: 999 + runAsGroup: 999 + runAsNonRoot: true + containers: + - name: postgresql + image: {{ include "agentevals.postgresql.image" . 
}} + imagePullPolicy: {{ $pg.image.pullPolicy }} + securityContext: + allowPrivilegeEscalation: false + ports: + - name: postgresql + containerPort: 5432 + protocol: TCP + env: + - name: POSTGRES_DB + value: "agentevals" + - name: POSTGRES_USER + value: "agentevals" + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "agentevals.passwordSecretName" . }} + key: POSTGRES_PASSWORD + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + livenessProbe: + exec: + command: + - pg_isready + - -U + - agentevals + - -d + - agentevals + initialDelaySeconds: 20 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + readinessProbe: + exec: + command: + - pg_isready + - -U + - agentevals + - -d + - agentevals + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + successThreshold: 1 + {{- with $pg.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + volumeMounts: + - name: data + mountPath: /var/lib/postgresql/data + volumes: + - name: data + persistentVolumeClaim: + claimName: {{ $fullname }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ $fullname }} + namespace: {{ include "agentevals.namespace" . }} + labels: + {{- include "agentevals.labels" . | nindent 4 }} + app.kubernetes.io/component: database +spec: + type: ClusterIP + ports: + - name: postgresql + port: 5432 + targetPort: postgresql + protocol: TCP + selector: + {{- include "agentevals.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: database +{{- end }} diff --git a/charts/agentevals/templates/service.yaml b/charts/agentevals/templates/service.yaml index 090ff3c..f224c08 100644 --- a/charts/agentevals/templates/service.yaml +++ b/charts/agentevals/templates/service.yaml @@ -25,4 +25,4 @@ spec: targetPort: mcp protocol: TCP selector: - {{- include "agentevals.selectorLabels" . | nindent 4 }} + {{- include "agentevals.app.selectorLabels" . | nindent 4 }} diff --git a/charts/agentevals/values.yaml b/charts/agentevals/values.yaml index f455af3..17a3571 100644 --- a/charts/agentevals/values.yaml +++ b/charts/agentevals/values.yaml @@ -2,7 +2,10 @@ # Global # ============================================================================== -# -- Number of replicas. Only 1 is supported (no shared job state across pods). +# -- Number of replicas. The default in-memory backend has no shared state, so +# scale beyond 1 only when storage.backend is "postgres" (durable runs/results +# in Postgres are safe to share across replicas via SELECT FOR UPDATE SKIP +# LOCKED claim semantics). replicaCount: 1 # -- Global container image registry (prepended to image.repository) @@ -155,3 +158,67 @@ env: [] # -- Extra envFrom sources (ConfigMapRef, SecretRef) envFrom: [] + +# ============================================================================== +# STORAGE +# ============================================================================== + +storage: + # -- Storage backend. "memory" (default) keeps the developer experience + # zero-config: nothing persisted, restarts lose in-flight state. "postgres" + # enables /api/runs and persists runs + results in Postgres. + backend: memory + +# ============================================================================== +# DATABASE CONFIGURATION +# ============================================================================== +# Used only when storage.backend is "postgres". Priority order (first match wins): +# 1. 
database.postgres.urlFile -- file-based DSN (workload identity friendly) +# 2. database.postgres.url -- literal DSN +# 3. database.postgres.bundled -- chart-bundled Postgres (dev/eval only) +# If none is configured the chart fails to render. + +database: + postgres: + # -- External Postgres connection string. + # When set, takes precedence over the bundled instance regardless of + # database.postgres.bundled.enabled. + url: "" + # -- Path to a file containing the connection string. Takes precedence + # over url when set. Useful for projected workload-identity tokens. + urlFile: "" + # -- Postgres schema to use for agentevals tables. + schema: agentevals + # -- Bundled Postgres instance for development and evaluation only. + # Not suitable for production. Deployed when enabled is true and url / + # urlFile are not set. + bundled: + # -- Set to true to deploy a chart-managed Postgres alongside the app. + # Off by default so the zero-config install stays in-memory. + enabled: false + image: + # -- Bundled Postgres image registry + registry: docker.io + # -- Bundled Postgres image repository (org/namespace) + repository: library + # -- Bundled Postgres image name + name: postgres + # -- Bundled Postgres image tag + tag: "17" + # -- Bundled Postgres image pull policy + pullPolicy: IfNotPresent + # -- PersistentVolumeClaim size for the bundled Postgres data + storage: 1Gi + # -- StorageClass for the PVC. Defaults to the cluster default when empty. + storageClassName: "" + # The database name, user, and password are hardcoded for the bundled + # instance (all: "agentevals"). This is intentional for a dev/eval + # setup. Switch to an external database for production. + # -- Resource requests/limits for the bundled Postgres container + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi diff --git a/pyproject.toml b/pyproject.toml index 936938c..6e0f391 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,9 @@ streaming = [ openai = [ "openai>=2.0", ] +postgres = [ + "asyncpg>=0.30.0", +] [project.scripts] agentevals = "agentevals.cli:main" @@ -43,6 +46,9 @@ artifacts = ["src/agentevals/_static/**"] [tool.hatch.build.targets.wheel] packages = ["src/agentevals"] +[tool.hatch.build.targets.wheel.force-include] +"src/agentevals/storage/postgres/migrations" = "agentevals/storage/postgres/migrations" + [tool.uv.workspace] members = ["packages/evaluator-sdk-py"] diff --git a/src/agentevals/api/app.py b/src/agentevals/api/app.py index ec3b3dd..4790510 100644 --- a/src/agentevals/api/app.py +++ b/src/agentevals/api/app.py @@ -16,13 +16,20 @@ from agentevals import __version__ +from ..run.service import RunService +from ..run.worker import AsyncRunWorker +from ..storage import StorageSettings, build_repos +from ..storage.postgres.migrator import Migrator from ..utils.log_buffer import log_buffer from .debug_routes import debug_router from .routes import router +from .runs_routes import runs_router if TYPE_CHECKING: from ..streaming.ws_server import StreamingTraceManager +logger = logging.getLogger(__name__) + try: from dotenv import load_dotenv @@ -51,7 +58,39 @@ async def lifespan(app: FastAPI): mgr = getattr(app.state, "trace_manager", None) if mgr: mgr.start_cleanup_task() + + storage_settings: StorageSettings | None = None + worker: AsyncRunWorker | None = None + try: + storage_settings = StorageSettings.from_env() + except Exception as exc: + logger.error("Storage configuration invalid; /api/runs will not be available: %s", exc) + + if storage_settings is 
not None and storage_settings.backend == "postgres": + logger.info("Applying any pending migrations to schema '%s'", storage_settings.schema_name) + migrator = Migrator( + dsn=storage_settings.database_url or "", + schema=storage_settings.schema_name, + lock_timeout_s=storage_settings.migrate_lock_timeout_s, + ) + await migrator.up() + + repos = await build_repos(storage_settings) + app.state.storage_settings = storage_settings + app.state.repos = repos + app.state.run_service = RunService(repos.runs, repos.results) + + worker = AsyncRunWorker(runs=repos.runs, results=repos.results, settings=storage_settings) + await worker.start() + app.state.run_worker = worker + yield + + if worker is not None: + await worker.stop() + repos = getattr(app.state, "repos", None) + if repos is not None: + await repos.close() if mgr: await mgr.shutdown() ae_logger.removeHandler(log_buffer) @@ -83,6 +122,7 @@ def create_app( app.include_router(router, prefix="/api") app.include_router(debug_router, prefix="/api/debug") + app.include_router(runs_router, prefix="/api") if trace_manager is not None: app.state.trace_manager = trace_manager diff --git a/src/agentevals/api/routes.py b/src/agentevals/api/routes.py index c65b1af..7538b51 100644 --- a/src/agentevals/api/routes.py +++ b/src/agentevals/api/routes.py @@ -22,6 +22,7 @@ BuiltinMetricDef, CodeEvaluatorDef, CustomEvaluatorDef, + EvalParams, EvalRunConfig, OpenAIEvalDef, ) @@ -68,6 +69,71 @@ def _camel_keys(obj: Any) -> Any: return obj +def _load_eval_set_dict(path: str | None) -> dict | None: + """Read the uploaded eval set file back into a dict for persistence. + + The on-disk file gets cleaned up with the temp dir; capturing the dict + here lets us store it on the run row so a future ``GET /api/runs/{id}`` + can show what was evaluated against without re-uploading the file. + """ + if not path: + return None + try: + with open(path) as f: + return json.load(f) + except (OSError, json.JSONDecodeError): + logger.warning("could not re-read eval_set file at %s for persistence", path) + return None + + +async def _maybe_persist_evaluate_run( + request: Request, + *, + params: "EvalParams", + eval_set_dict: dict | None, + trace_format: str | None, + upload_filenames: list[str] | None, + run_result: "RunResult", +) -> str | None: + """Persist a synchronously-completed eval as a Run + Result rows when + ``app.state.run_service`` is configured (i.e. ``backend=postgres``). + + Returns the synthesized ``run_id`` so the caller can attach it to the + response (UI / SSE clients can then ``GET /api/runs/{id}/results`` to + pull historical context). Returns None on the memory backend so callers + keep their existing zero-config behavior. Errors are logged but never + propagated; if persistence fails the eval result is still returned to + the caller. 
+ """ + service = getattr(request.app.state, "run_service", None) + if service is None: + return None + try: + from ..run.service import RunService + from ..storage.models import RunSpec, TraceTarget + + filenames = list(upload_filenames or []) + target = TraceTarget( + kind="uploaded", + trace_format=trace_format if trace_format in ("jaeger-json", "otlp-json") else None, + trace_count=len(filenames), + trace_files=filenames, + ) + spec_payload = params.model_dump(by_alias=False) + spec = RunSpec( + approach="trace_replay", + target=target, + eval_config=spec_payload, + eval_set=eval_set_dict, + ) + assert isinstance(service, RunService) + run = await service.record_completed_eval(spec=spec, params=params, run_result=run_result) + return str(run.run_id) + except Exception: + logger.exception("failed to persist /api/evaluate run; eval result still returned to caller") + return None + + router = APIRouter() _MAX_JSON_BODY_BYTES = 50 * 1024 * 1024 # 50 MB (multipart endpoints allow 10 MB per file) @@ -434,6 +500,7 @@ async def convert_trace_files( @router.post("/evaluate", response_model=StandardResponse[RunResult]) async def evaluate_traces( + request: Request, trace_files: list[UploadFile] = File(...), config: str = Form(...), eval_set_file: UploadFile | None = File(None), @@ -542,6 +609,17 @@ async def evaluate_traces( logger.info(f"Evaluating {len(trace_paths)} trace file(s) with metrics: {metrics}") result = await run_evaluation(eval_config) + run_id = await _maybe_persist_evaluate_run( + request, + params=eval_config, + eval_set_dict=_load_eval_set_dict(eval_set_path), + trace_format=eval_config.trace_format, + upload_filenames=[tf.filename for tf in trace_files if tf.filename], + run_result=result, + ) + if run_id: + result.run_id = run_id + result_dict = _camel_keys(result.model_dump(by_alias=True)) return StandardResponse(data=result_dict) @@ -557,12 +635,14 @@ async def evaluate_traces( @router.post("/evaluate/stream") async def evaluate_traces_stream( + request: Request, trace_files: list[UploadFile] = File(...), config: str = Form(...), eval_set_file: UploadFile | None = File(None), ): """Evaluate traces with real-time progress via SSE.""" temp_dir = tempfile.mkdtemp() + upload_filenames = [tf.filename for tf in trace_files if tf.filename] async def event_generator(): try: @@ -678,6 +758,16 @@ async def run_with_progress(): tag, payload = msg if tag == "done": + run_id = await _maybe_persist_evaluate_run( + request, + params=eval_config, + eval_set_dict=_load_eval_set_dict(eval_set_path), + trace_format=eval_config.trace_format, + upload_filenames=upload_filenames, + run_result=payload, + ) + if run_id: + payload.run_id = run_id evt = SSEDoneEvent( result=_camel_keys(payload.model_dump(by_alias=True)), ) @@ -768,6 +858,16 @@ async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Reques config=request.config, eval_set=eval_set, ) + run_id = await _maybe_persist_evaluate_run( + raw_request, + params=request.config, + eval_set_dict=request.eval_set, + trace_format=None, + upload_filenames=None, + run_result=result, + ) + if run_id: + result.run_id = run_id return StandardResponse(data=_camel_keys(result.model_dump(by_alias=True))) except Exception as exc: logger.exception("JSON evaluation failed") @@ -827,6 +927,16 @@ async def run_with_progress(): tag, payload = msg if tag == "done": + run_id = await _maybe_persist_evaluate_run( + raw_request, + params=request.config, + eval_set_dict=request.eval_set, + trace_format=None, + upload_filenames=None, + 
run_result=payload, + ) + if run_id: + payload.run_id = run_id evt = SSEDoneEvent( result=_camel_keys(payload.model_dump(by_alias=True)), ) diff --git a/src/agentevals/api/runs_routes.py b/src/agentevals/api/runs_routes.py new file mode 100644 index 0000000..99ae71a --- /dev/null +++ b/src/agentevals/api/runs_routes.py @@ -0,0 +1,114 @@ +"""HTTP router for the async run pipeline. + +Mounted only when ``AGENTEVALS_STORAGE_BACKEND=postgres``. Submission is +idempotent on ``run_id``: re-posting the same id with an identical spec +returns the persisted row; re-posting with a different spec returns +``409 Conflict``. +""" + +from __future__ import annotations + +import logging +from datetime import datetime +from uuid import UUID + +from fastapi import APIRouter, HTTPException, Query, Request, status +from pydantic import ConfigDict +from pydantic.alias_generators import to_camel + +from ..run.service import RunService, RunSubmitConflict +from ..storage.models import Result, Run, RunSpec, RunStatus +from .models import CamelModel, StandardResponse + +logger = logging.getLogger(__name__) + +runs_router = APIRouter(tags=["runs"]) + + +class RunRequest(CamelModel): + """POST body for ``/api/runs``.""" + + run_id: UUID | None = None + spec: RunSpec + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True, extra="allow") + + +class RunSummary(CamelModel): + run_id: UUID + status: RunStatus + created_at: datetime + + +def _service(request: Request) -> RunService: + service = getattr(request.app.state, "run_service", None) + if service is None: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="run service is not configured (set AGENTEVALS_STORAGE_BACKEND=postgres)", + ) + return service + + +@runs_router.post( + "/runs", + response_model=StandardResponse[Run], + status_code=status.HTTP_202_ACCEPTED, +) +async def submit_run(payload: RunRequest, request: Request): + service = _service(request) + try: + run = await service.submit(run_id=payload.run_id, spec=payload.spec) + except RunSubmitConflict as exc: + raise HTTPException( + status_code=status.HTTP_409_CONFLICT, + detail={ + "message": "run_id already exists with a different spec", + "persisted": exc.persisted.model_dump(mode="json", by_alias=True), + }, + ) from exc + return StandardResponse(data=run) + + +@runs_router.get("/runs/{run_id}", response_model=StandardResponse[Run]) +async def get_run(run_id: UUID, request: Request): + service = _service(request) + run = await service.get(run_id) + if run is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"run {run_id} not found") + return StandardResponse(data=run) + + +@runs_router.get("/runs", response_model=StandardResponse[list[Run]]) +async def list_runs( + request: Request, + status_filter: list[RunStatus] | None = Query(default=None, alias="status"), + limit: int = Query(default=100, ge=1, le=1000), + before: datetime | None = Query(default=None), +): + service = _service(request) + runs = await service.list(status=status_filter, limit=limit, before=before) + return StandardResponse(data=runs) + + +@runs_router.get("/runs/{run_id}/results", response_model=StandardResponse[list[Result]]) +async def list_run_results(run_id: UUID, request: Request): + service = _service(request) + run = await service.get(run_id) + if run is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"run {run_id} not found") + results = await service.list_results(run_id) + return StandardResponse(data=results) + + 
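+# Cancellation is cooperative rather than immediate: per the worker module,
+# POST /api/runs/{id}/cancel only flags ``cancel_requested`` on the row. The
+# owning worker's heartbeat observes the flag on its next tick and tears the
+# run down, so a successful cancel may still report a non-terminal status for
+# a few seconds; poll ``GET /api/runs/{run_id}`` to watch it settle.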
+@runs_router.post("/runs/{run_id}/cancel", response_model=StandardResponse[Run]) +async def cancel_run(run_id: UUID, request: Request): + service = _service(request) + cancelled = await service.cancel(run_id) + run = await service.get(run_id) + if run is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"run {run_id} not found") + if not cancelled and run.status not in (RunStatus.QUEUED, RunStatus.RUNNING): + # Already terminal; surface that to the caller without an error. + return StandardResponse(data=run) + return StandardResponse(data=run) diff --git a/src/agentevals/cli.py b/src/agentevals/cli.py index 9f1c37e..666cce8 100644 --- a/src/agentevals/cli.py +++ b/src/agentevals/cli.py @@ -729,6 +729,147 @@ def serve( asyncio.run(_run_servers(host, port, otlp_http_port, otlp_grpc_port, mcp_port=mcp_port)) +# --------------------------------------------------------------------------- +# agentevals migrate ... +# --------------------------------------------------------------------------- + + +@main.group("migrate") +def migrate_group() -> None: + """Manage the Postgres schema for AGENTEVALS_STORAGE_BACKEND=postgres.""" + + +def _migrator_or_die() -> "object": + from pydantic import ValidationError + + from .storage.config import StorageSettings + from .storage.postgres.migrator import Migrator + + try: + settings = StorageSettings.from_env() + except ValidationError as exc: + # Extract the first inner message so CLI users see "AGENTEVALS_..." rather + # than the multi-line Pydantic dump. + first = exc.errors()[0] if exc.errors() else {"msg": str(exc)} + raise click.ClickException(first.get("msg", str(exc))) from exc + except Exception as exc: + raise click.ClickException(str(exc)) from exc + if not settings.database_url: + raise click.ClickException("AGENTEVALS_DATABASE_URL is required for migrations") + return Migrator( + dsn=settings.database_url, + schema=settings.schema_name, + lock_timeout_s=settings.migrate_lock_timeout_s, + ) + + +@migrate_group.command("up") +@click.option("--dry-run", is_flag=True, help="Print which migrations would apply without executing.") +def migrate_up(dry_run: bool) -> None: + """Apply all pending migrations.""" + migrator = _migrator_or_die() + try: + applied = asyncio.run(migrator.up(dry_run=dry_run)) + except Exception as exc: + raise click.ClickException(f"migration failed: {exc}") from exc + if not applied: + click.echo("Nothing to apply.") + else: + verb = "Would apply" if dry_run else "Applied" + for v in applied: + click.echo(f"{verb} {v:06d}") + + +@migrate_group.command("down") +@click.option("--steps", type=int, required=True, help="Number of migrations to roll back (>= 1).") +@click.confirmation_option( + prompt="Rolling back migrations is destructive and may delete data. Continue?", +) +def migrate_down(steps: int) -> None: + """Roll back the last N migrations. 
Prints SQL for each step before executing.""" + migrator = _migrator_or_die() + try: + rolled = asyncio.run(migrator.down(steps=steps)) + except Exception as exc: + raise click.ClickException(f"rollback failed: {exc}") from exc + if not rolled: + click.echo("Nothing to roll back.") + else: + for version, name in rolled: + click.echo(f"Rolled back {version:06d}_{name}") + + +@migrate_group.command("version") +def migrate_version() -> None: + """Print the current schema version and the dirty flag.""" + migrator = _migrator_or_die() + status = asyncio.run(migrator.status()) + if status.version is None: + click.echo("schema not initialized (no migrations applied)") + else: + click.echo(f"version={status.version:06d} dirty={status.dirty}") + + +@migrate_group.command("force") +@click.argument("version", type=int) +def migrate_force(version: int) -> None: + """Set the schema version and clear the dirty flag. Recovery only. + + Use after fixing a partially-applied migration manually. Does not run any + SQL; only updates the schema_migrations row. + """ + migrator = _migrator_or_die() + asyncio.run(migrator.force(version)) + click.echo(f"forced version={version:06d} dirty=False") + + +@migrate_group.command("create") +@click.argument("name") +@click.option( + "--output-dir", + "-o", + type=click.Path(file_okay=False), + default=None, + help="Where to write the new files (defaults to the in-tree migrations directory).", +) +def migrate_create(name: str, output_dir: str | None) -> None: + """Generate an empty NNNNNN_.up.sql + .down.sql pair.""" + import re as _re + from pathlib import Path as _Path + + if not _re.match(r"^[a-z0-9_]+$", name): + raise click.ClickException("name must match [a-z0-9_]+") + + from .storage.postgres.migrator import discover_migrations + + if output_dir is None: + repo_path = _Path(__file__).resolve().parent / "storage" / "postgres" / "migrations" + if not repo_path.is_dir(): + raise click.ClickException( + f"migrations dir not found at {repo_path} (run 'create' from a checkout, not an installed wheel)" + ) + target = repo_path + else: + target = _Path(output_dir) + target.mkdir(parents=True, exist_ok=True) + + existing = discover_migrations() + next_version = (max((m.version for m in existing), default=0) + 1) if existing else 1 + up_path = target / f"{next_version:06d}_{name}.up.sql" + down_path = target / f"{next_version:06d}_{name}.down.sql" + if up_path.exists() or down_path.exists(): + raise click.ClickException(f"{up_path.name} or {down_path.name} already exists") + + header = ( + f"-- Migration {next_version:06d}: {name}\n" + "-- Once tagged in a release this file is immutable. Fix bugs by adding a NEW migration.\n\n" + ) + up_path.write_text(header) + down_path.write_text(header) + click.echo(f"Created {up_path}") + click.echo(f"Created {down_path}") + + @main.command("mcp") @click.option( "--server-url", diff --git a/src/agentevals/run/__init__.py b/src/agentevals/run/__init__.py new file mode 100644 index 0000000..1a0e482 --- /dev/null +++ b/src/agentevals/run/__init__.py @@ -0,0 +1,9 @@ +"""Async run pipeline for ``POST /api/runs``. + +Contents: +- :mod:`fetcher` resolves a run spec's ``target`` into a list of traces. +- :mod:`sinks` fan-out result delivery (stdout, file, http_webhook). +- :mod:`service` is the synchronous control surface used by HTTP handlers. +- :mod:`worker` is the in-process loop that claims runs and drives the + existing :func:`agentevals.runner.run_evaluation_from_traces` pipeline. 
+""" diff --git a/src/agentevals/run/fetcher.py b/src/agentevals/run/fetcher.py new file mode 100644 index 0000000..34f8bae --- /dev/null +++ b/src/agentevals/run/fetcher.py @@ -0,0 +1,83 @@ +"""Trace fetchers — resolve a run spec's ``target`` into a list of Trace objects. + +Two implementations ship: ``inline`` (the JSON payload is embedded in the +spec) and ``http`` (the worker GETs ``{base_url}/{trace_id}`` with headers +sourced from ``context.headers``). Auth headers are pass-through; this layer +does not validate them. +""" + +from __future__ import annotations + +import json +import logging +import tempfile +from pathlib import Path +from typing import Protocol + +import httpx + +from ..loader import load_traces +from ..loader.base import Trace +from ..storage.models import TraceTarget + +logger = logging.getLogger(__name__) + + +class TraceFetcher(Protocol): + async def fetch(self, target: TraceTarget, context: dict) -> list[Trace]: ... + + +class InlineTraceFetcher: + """Materializes inline JSON to a temp file and parses it via the existing loader. + + The temp file dance reuses :func:`agentevals.loader.load_traces` (which + auto-detects format) without a special-case in the loader for dict input. + """ + + async def fetch(self, target: TraceTarget, context: dict) -> list[Trace]: + if not target.inline: + raise ValueError("InlineTraceFetcher requires target.inline to be set") + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(target.inline, f) + path = Path(f.name) + try: + return load_traces(str(path), format=target.trace_format) + finally: + path.unlink(missing_ok=True) # noqa: ASYNC240 + + +class HttpTraceFetcher: + """Fetches the trace JSON over HTTP. Auth is opaque header pass-through.""" + + def __init__(self, timeout_s: float = 30.0) -> None: + self._timeout_s = timeout_s + + async def fetch(self, target: TraceTarget, context: dict) -> list[Trace]: + if not target.base_url or not target.trace_id: + raise ValueError("HttpTraceFetcher requires target.base_url and target.trace_id") + url = target.base_url.rstrip("/") + "/" + target.trace_id + headers = (context.get("headers") if isinstance(context, dict) else {}) or {} + async with httpx.AsyncClient(timeout=self._timeout_s) as client: + resp = await client.get(url, headers=headers) + resp.raise_for_status() + payload = resp.json() + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(payload, f) + path = Path(f.name) + try: + return load_traces(str(path), format=target.trace_format) + finally: + path.unlink(missing_ok=True) # noqa: ASYNC240 + + +def resolve_fetcher(target: TraceTarget) -> TraceFetcher: + if target.kind == "inline": + return InlineTraceFetcher() + if target.kind == "http": + return HttpTraceFetcher() + if target.kind == "uploaded": + raise ValueError( + "target kind 'uploaded' records a synchronous /api/evaluate call and cannot be " + "re-executed by the worker; the run already completed at submission time" + ) + raise ValueError(f"unknown trace target kind '{target.kind}'") diff --git a/src/agentevals/run/result_builder.py b/src/agentevals/run/result_builder.py new file mode 100644 index 0000000..5d30d74 --- /dev/null +++ b/src/agentevals/run/result_builder.py @@ -0,0 +1,82 @@ +"""Shared helpers that project a :class:`agentevals.runner.RunResult` onto +the persisted shapes (:class:`agentevals.storage.models.Result` rows + a +JSON ``summary`` blob). 
+ +Used both by the async worker (when a queued run finishes) and by the +``/api/evaluate`` route handler (when a synchronous UI upload finishes), so +both paths produce identical persisted shapes. +""" + +from __future__ import annotations + +from typing import Any, Literal +from uuid import UUID + +from ..config import EvalParams +from ..runner import RunResult +from ..storage.models import Result + +EvaluatorType = Literal["builtin", "code", "remote", "openai_eval"] + + +def classify_evaluator(metric_name: str, params: EvalParams) -> EvaluatorType: + """Look up whether a metric was a built-in or a custom evaluator, + falling back to ``builtin`` so unknown names round-trip cleanly rather + than raising during persistence.""" + for ce in params.custom_evaluators: + if ce.name == metric_name: + return ce.type + return "builtin" + + +def build_results(run_id: UUID, params: EvalParams, run_result: RunResult) -> list[Result]: + """Flatten ``run_result.trace_results[*].metric_results[*]`` into a list + of persistable :class:`Result` rows. + + The ``eval_set_item_id`` and ``eval_set_item_name`` both default to the + trace_id, since OSS doesn't currently extract a stable per-eval-case + identifier from the ADK :class:`EvalSet`. Callers may post-process to + attach their own identifiers. + """ + out: list[Result] = [] + for trace_result in run_result.trace_results: + item_id = trace_result.trace_id + item_name = trace_result.trace_id + for mr in trace_result.metric_results: + out.append( + Result.from_metric_result( + run_id=run_id, + eval_set_item_id=item_id, + eval_set_item_name=item_name, + trace_id=trace_result.trace_id, + evaluator_type=classify_evaluator(mr.metric_name, params), + metric_result=mr, + ) + ) + return out + + +def summarize_run_result(run_result: RunResult) -> dict[str, Any]: + """Summary blob persisted alongside the run row. + + Counts mirror :class:`agentevals.storage.models.ResultStatus` values so a + caller polling ``GET /api/runs/{id}`` can compute pass/fail rates without + fetching the full result list. + """ + counts = {"passed": 0, "failed": 0, "errored": 0, "skipped": 0} + for tr in run_result.trace_results: + for mr in tr.metric_results: + if mr.error: + counts["errored"] += 1 + elif (mr.eval_status or "").upper() == "PASSED": + counts["passed"] += 1 + elif (mr.eval_status or "").upper() == "FAILED": + counts["failed"] += 1 + else: + counts["skipped"] += 1 + return { + "trace_count": len(run_result.trace_results), + "result_counts": counts, + "errors": list(run_result.errors), + "performance_metrics": run_result.performance_metrics, + } diff --git a/src/agentevals/run/service.py b/src/agentevals/run/service.py new file mode 100644 index 0000000..e0b3f36 --- /dev/null +++ b/src/agentevals/run/service.py @@ -0,0 +1,127 @@ +"""Synchronous control surface used by ``/api/runs`` HTTP handlers. + +Wraps the :class:`agentevals.storage.repos.RunRepository` with submit +idempotency, list pagination, and the 409 spec-mismatch path. + +Also provides :meth:`RunService.record_completed_eval` for the +``/api/evaluate`` path: that handler executes synchronously (the trace was +already supplied as multipart and the result is being streamed back over +SSE), so we synthesize a Run row for visibility in run history rather than +queueing work for the worker. 
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from datetime import datetime, timezone
+from uuid import UUID, uuid4
+
+from ..config import EvalParams
+from ..runner import RunResult
+from ..storage.models import Run, RunSpec, RunStatus
+from ..storage.repos import ResultRepository, RunRepository
+from .result_builder import build_results, summarize_run_result
+
+logger = logging.getLogger(__name__)
+
+
+def _now() -> datetime:
+    return datetime.now(timezone.utc)
+
+
+class RunSubmitConflict(Exception):
+    """Raised when a re-submission's spec differs from the persisted one.
+
+    The caller (HTTP handler) maps this to ``409 Conflict`` and returns the
+    persisted run so the client can reconcile.
+    """
+
+    def __init__(self, persisted: Run) -> None:
+        super().__init__(f"run {persisted.run_id} already exists with a different spec")
+        self.persisted = persisted
+
+
+class RunService:
+    def __init__(self, runs: RunRepository, results: ResultRepository) -> None:
+        self._runs = runs
+        self._results = results
+
+    async def submit(self, *, run_id: UUID | None, spec: RunSpec) -> Run:
+        run = Run(
+            run_id=run_id or uuid4(),
+            status=RunStatus.QUEUED,
+            spec=spec,
+        )
+        persisted = await self._runs.create(run)
+        if persisted.run_id == run.run_id and not _specs_equal(persisted.spec, spec):
+            raise RunSubmitConflict(persisted)
+        return persisted
+
+    async def get(self, run_id: UUID) -> Run | None:
+        return await self._runs.get(run_id)
+
+    async def list(
+        self,
+        *,
+        status: list[RunStatus] | None = None,
+        limit: int = 100,
+        before: datetime | None = None,
+    ) -> list[Run]:
+        return await self._runs.list(status=status, limit=limit, before=before)
+
+    async def list_results(self, run_id: UUID):
+        return await self._results.list_by_run(run_id)
+
+    async def cancel(self, run_id: UUID) -> bool:
+        return await self._runs.cancel(run_id)
+
+    async def record_completed_eval(
+        self,
+        *,
+        spec: RunSpec,
+        params: EvalParams,
+        run_result: RunResult,
+    ) -> Run:
+        """Persist a synchronously-completed eval as a Run row plus Result rows.
+
+        The run is created as ``queued`` but already carrying a synthetic
+        worker id and ``started_at`` (which keeps the ``run_running_has_worker``
+        check satisfied without a real executor claiming the row), then
+        transitioned to a terminal state in the same call. Two writes per
+        eval, but using the public :class:`RunRepository` API avoids leaking
+        an executor-only schema requirement into this layer.
+        """
+        run_id = uuid4()
+        worker_id = "sync:/api/evaluate"
+        run = Run(
+            run_id=run_id,
+            status=RunStatus.QUEUED,
+            spec=spec,
+            attempt=1,
+            worker_id=worker_id,
+            started_at=_now(),
+        )
+        await self._runs.create(run)
+
+        results = build_results(run_id, params, run_result)
+        await self._results.upsert_many(run_id, results)
+
+        summary = summarize_run_result(run_result)
+        if run_result.errors:
+            error = "; ".join(run_result.errors[:3])
+            await self._runs.update_status(run_id, RunStatus.FAILED, error=error, summary=summary)
+            run.status = RunStatus.FAILED
+            run.error = error
+        else:
+            await self._runs.update_status(run_id, RunStatus.SUCCEEDED, summary=summary)
+            run.status = RunStatus.SUCCEEDED
+            run.summary = summary
+        return run
+
+
+def _specs_equal(a: RunSpec, b: RunSpec) -> bool:
+    """Deep equality on the JSON projection.
Pydantic equality compares model + instances by class identity, which trips up the round-trip from JSONB.""" + return json.dumps(a.model_dump(by_alias=False), sort_keys=True) == json.dumps( + b.model_dump(by_alias=False), sort_keys=True + ) diff --git a/src/agentevals/run/sinks.py b/src/agentevals/run/sinks.py new file mode 100644 index 0000000..d12eac1 --- /dev/null +++ b/src/agentevals/run/sinks.py @@ -0,0 +1,230 @@ +"""Result sinks — best-effort fan-out of run results. + +The :class:`agentevals.storage.repos.ResultRepository` is always written; +sinks are an additional delivery channel. Sink failures are logged with +``run_id`` / ``result_id`` but do not fail the run. +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +import sys +from pathlib import Path +from typing import Any, Protocol +from uuid import UUID + +import httpx + +from ..storage.models import Result + +logger = logging.getLogger(__name__) + + +class ResultSink(Protocol): + async def emit_partial(self, run_id: UUID, results: list[Result], attempt: int) -> None: ... + async def emit_final(self, run_id: UUID, summary: dict, attempt: int) -> None: ... + async def emit_error(self, run_id: UUID, error: str, attempt: int) -> None: ... + + +def _result_payload(r: Result) -> dict: + return r.model_dump(mode="json", by_alias=True) + + +class StdoutSink: + async def emit_partial(self, run_id: UUID, results: list[Result], attempt: int) -> None: + for r in results: + sys.stdout.write( + json.dumps({"phase": "partial", "run_id": str(run_id), "result": _result_payload(r)}) + "\n" + ) + sys.stdout.flush() + + async def emit_final(self, run_id: UUID, summary: dict, attempt: int) -> None: + sys.stdout.write(json.dumps({"phase": "final", "run_id": str(run_id), "summary": summary}) + "\n") + sys.stdout.flush() + + async def emit_error(self, run_id: UUID, error: str, attempt: int) -> None: + sys.stdout.write(json.dumps({"phase": "error", "run_id": str(run_id), "error": error}) + "\n") + sys.stdout.flush() + + +class FileSink: + """Append-only newline-delimited JSON. Each event is one line.""" + + def __init__(self, path: str | Path) -> None: + self._path = Path(path) + self._lock = asyncio.Lock() + + async def _write(self, payload: dict) -> None: + async with self._lock: + self._path.parent.mkdir(parents=True, exist_ok=True) + with self._path.open("a") as f: # noqa: ASYNC230 + f.write(json.dumps(payload) + "\n") + + async def emit_partial(self, run_id: UUID, results: list[Result], attempt: int) -> None: + for r in results: + await self._write({"phase": "partial", "run_id": str(run_id), "result": _result_payload(r)}) + + async def emit_final(self, run_id: UUID, summary: dict, attempt: int) -> None: + await self._write({"phase": "final", "run_id": str(run_id), "summary": summary}) + + async def emit_error(self, run_id: UUID, error: str, attempt: int) -> None: + await self._write({"phase": "error", "run_id": str(run_id), "error": error}) + + +class HttpWebhookSink: + """POST JSON to a URL with retries. + + Auth headers come from the spec via ``headers`` (literal values) or + ``headers_from_env`` (env var names whose values are read at emit time). + Reading at emit time means a host can rotate the env var without + restarting agentevals. 
+ """ + + def __init__( + self, + url: str, + *, + headers: dict[str, str] | None = None, + headers_from_env: dict[str, str] | None = None, + timeout_s: float = 10.0, + max_attempts: int = 5, + ) -> None: + self._url = url + self._headers = headers or {} + self._headers_from_env = headers_from_env or {} + self._timeout_s = timeout_s + self._max_attempts = max_attempts + + def _resolve_headers(self) -> dict[str, str]: + merged = dict(self._headers) + for header, env_var in self._headers_from_env.items(): + value = os.environ.get(env_var) + if value is not None: + merged[header] = value + merged.setdefault("Content-Type", "application/json") + return merged + + async def _post(self, payload: dict) -> None: + delay = 0.5 + last_exc: Exception | None = None + for attempt in range(1, self._max_attempts + 1): + try: + async with httpx.AsyncClient(timeout=self._timeout_s) as client: + resp = await client.post(self._url, json=payload, headers=self._resolve_headers()) + if resp.status_code < 500: + if resp.status_code >= 400: + logger.warning( + "Webhook %s returned %d: %s (run_id=%s)", + self._url, + resp.status_code, + resp.text[:200], + payload.get("run_id"), + ) + return + last_exc = RuntimeError(f"HTTP {resp.status_code}: {resp.text[:200]}") + except (httpx.HTTPError, RuntimeError) as exc: + last_exc = exc + if attempt < self._max_attempts: + await asyncio.sleep(delay) + delay = min(delay * 2, 10.0) + logger.error( + "Webhook %s failed after %d attempts: %s (run_id=%s)", + self._url, + self._max_attempts, + last_exc, + payload.get("run_id"), + ) + + async def emit_partial(self, run_id: UUID, results: list[Result], attempt: int) -> None: + await self._post( + { + "phase": "partial", + "run_id": str(run_id), + "attempt": attempt, + "results": [_result_payload(r) for r in results], + } + ) + + async def emit_final(self, run_id: UUID, summary: dict, attempt: int) -> None: + await self._post({"phase": "final", "run_id": str(run_id), "attempt": attempt, "summary": summary}) + + async def emit_error(self, run_id: UUID, error: str, attempt: int) -> None: + await self._post({"phase": "error", "run_id": str(run_id), "attempt": attempt, "error": error}) + + +class SinkFanout: + """Runs sinks in parallel. Failures are isolated per sink.""" + + def __init__(self, sinks: list[ResultSink]) -> None: + self._sinks = sinks + + async def emit_partial(self, run_id: UUID, results: list[Result], attempt: int) -> None: + await asyncio.gather( + *(self._guard(s.emit_partial(run_id, results, attempt), "partial") for s in self._sinks), + return_exceptions=False, + ) + + async def emit_final(self, run_id: UUID, summary: dict, attempt: int) -> None: + await asyncio.gather( + *(self._guard(s.emit_final(run_id, summary, attempt), "final") for s in self._sinks), + return_exceptions=False, + ) + + async def emit_error(self, run_id: UUID, error: str, attempt: int) -> None: + await asyncio.gather( + *(self._guard(s.emit_error(run_id, error, attempt), "error") for s in self._sinks), + return_exceptions=False, + ) + + @staticmethod + async def _guard(coro: Any, phase: str) -> None: + try: + await coro + except Exception: + logger.exception("sink delivery failed in phase=%s", phase) + + +def build_sinks(specs: list[dict]) -> SinkFanout: + """Construct a fan-out from the run spec's ``sinks`` array. + + Each spec is a dict with ``kind`` plus kind-specific args. Unknown kinds + are skipped with a warning so a future kind added by a host doesn't + break older agentevals replicas mid-rollout. 
+ """ + sinks: list[ResultSink] = [] + for spec in specs: + kind = spec.get("kind") + if kind == "stdout": + sinks.append(StdoutSink()) + elif kind == "file": + sinks.append(FileSink(spec["path"])) + elif kind == "http_webhook": + sinks.append( + HttpWebhookSink( + url=spec["url"], + headers=spec.get("headers"), + headers_from_env=spec.get("headers_from_env") or _extract_env_headers(spec.get("auth")), + timeout_s=float(spec.get("timeout_s", 10.0)), + max_attempts=int(spec.get("max_attempts", 5)), + ) + ) + else: + logger.warning("unknown sink kind '%s'; skipping", kind) + return SinkFanout(sinks) + + +def _extract_env_headers(auth: Any) -> dict[str, str]: + """Map the design-doc shape ``auth.headers..from_env`` to env-var lookups.""" + result: dict[str, str] = {} + if not isinstance(auth, dict): + return result + headers = auth.get("headers") if auth.get("kind") == "headers" else None + if not isinstance(headers, dict): + return result + for header_name, value in headers.items(): + if isinstance(value, dict) and "from_env" in value: + result[header_name] = value["from_env"] + return result diff --git a/src/agentevals/run/worker.py b/src/agentevals/run/worker.py new file mode 100644 index 0000000..0f2562d --- /dev/null +++ b/src/agentevals/run/worker.py @@ -0,0 +1,188 @@ +"""Async run worker. + +A pool of asyncio tasks each loop on ``run_repo.claim_next``, heartbeat the +lease while executing, and drive the existing +:func:`agentevals.runner.run_evaluation_from_traces` pipeline. + +Cancellation is signaled by setting ``run.cancel_requested`` via +``POST /api/runs/{id}/cancel``. The heartbeat task observes the flag on each +tick and cancels the worker task; the worker catches and finalizes the run +as ``cancelled``. +""" + +from __future__ import annotations + +import asyncio +import logging +import socket +from datetime import datetime, timedelta, timezone +from uuid import UUID + +from google.adk.evaluation.eval_set import EvalSet + +from ..config import EvalParams +from ..runner import RunResult, TraceResult, run_evaluation_from_traces +from ..storage.config import StorageSettings +from ..storage.models import Run, RunStatus +from ..storage.repos import ResultRepository, RunRepository +from .fetcher import resolve_fetcher +from .result_builder import build_results, summarize_run_result +from .sinks import SinkFanout, build_sinks + +logger = logging.getLogger(__name__) + + +def _now() -> datetime: + return datetime.now(timezone.utc) + + +class _CancelledByRequest(Exception): + """Raised inside the worker task when the heartbeat observes cancel_requested.""" + + +class AsyncRunWorker: + """Manages the worker task pool. 
``start()`` spawns N loops; ``stop()`` + cancels them and waits for graceful shutdown.""" + + def __init__( + self, + *, + runs: RunRepository, + results: ResultRepository, + settings: StorageSettings, + ) -> None: + self._runs = runs + self._results = results + self._settings = settings + self._tasks: list[asyncio.Task] = [] + self._stopping = asyncio.Event() + self._worker_id_prefix = f"{socket.gethostname()}/{id(self):x}" + + async def start(self) -> None: + self._stopping.clear() + for i in range(self._settings.max_concurrent_runs): + wid = f"{self._worker_id_prefix}/{i}" + self._tasks.append(asyncio.create_task(self._loop(wid), name=f"agentevals-worker-{i}")) + logger.info( + "Started %d run worker(s) (lease=%ds, heartbeat=%ds, deadline=%ds)", + self._settings.max_concurrent_runs, + self._settings.lease_s, + self._settings.heartbeat_s, + self._settings.run_deadline_s, + ) + + async def stop(self) -> None: + self._stopping.set() + for t in self._tasks: + t.cancel() + if self._tasks: + await asyncio.gather(*self._tasks, return_exceptions=True) + self._tasks.clear() + logger.info("Run workers stopped") + + async def _loop(self, worker_id: str) -> None: + lease = timedelta(seconds=self._settings.lease_s) + poll = self._settings.worker_poll_interval_s + while not self._stopping.is_set(): + try: + run = await self._runs.claim_next( + worker_id=worker_id, + lease=lease, + max_attempts=self._settings.max_run_attempts, + ) + except asyncio.CancelledError: + return + except Exception: + logger.exception("claim_next failed; backing off") + await asyncio.sleep(min(poll * 5, 30.0)) + continue + + if run is None: + try: + await asyncio.sleep(poll) + except asyncio.CancelledError: + return + continue + + await self._execute(run, worker_id) + + async def _execute(self, run: Run, worker_id: str) -> None: + logger.info("worker=%s claimed run=%s (attempt=%d)", worker_id, run.run_id, run.attempt) + cancel_event = asyncio.Event() + hb_task = asyncio.create_task(self._heartbeat(run.run_id, worker_id, cancel_event)) + sinks = build_sinks(run.spec.sinks or []) + try: + await self._run_evaluation(run, sinks, cancel_event) + except asyncio.CancelledError: + await self._runs.update_status(run.run_id, RunStatus.CANCELLED, error="worker cancelled") + await sinks.emit_error(run.run_id, "worker cancelled", run.attempt) + raise + except _CancelledByRequest: + logger.info("run=%s cancelled by request", run.run_id) + await self._runs.update_status(run.run_id, RunStatus.CANCELLED, error="cancelled by request") + await sinks.emit_error(run.run_id, "cancelled by request", run.attempt) + except TimeoutError: + logger.warning("run=%s exceeded deadline of %ds", run.run_id, self._settings.run_deadline_s) + await self._runs.update_status(run.run_id, RunStatus.FAILED, error="deadline_exceeded") + await sinks.emit_error(run.run_id, "deadline_exceeded", run.attempt) + except Exception as exc: + logger.exception("run=%s failed", run.run_id) + await self._runs.update_status(run.run_id, RunStatus.FAILED, error=str(exc)) + await sinks.emit_error(run.run_id, str(exc), run.attempt) + finally: + hb_task.cancel() + try: + await hb_task + except (asyncio.CancelledError, Exception): + pass + + async def _run_evaluation(self, run: Run, sinks: SinkFanout, cancel_event: asyncio.Event) -> None: + params = EvalParams.model_validate(run.spec.eval_config or {}) + eval_set: EvalSet | None = None + if run.spec.eval_set: + eval_set = EvalSet.model_validate(run.spec.eval_set) + + fetcher = resolve_fetcher(run.spec.target) + + async def 
_trace_progress(trace_result: TraceResult) -> None: + partial = build_results(run.run_id, params, RunResult(trace_results=[trace_result])) + await self._results.upsert_many(run.run_id, partial) + await sinks.emit_partial(run.run_id, partial, run.attempt) + if cancel_event.is_set(): + raise _CancelledByRequest() + + async with asyncio.timeout(self._settings.run_deadline_s): + traces = await fetcher.fetch(run.spec.target, run.spec.context) + if cancel_event.is_set(): + raise _CancelledByRequest() + run_result = await run_evaluation_from_traces( + traces=traces, + config=params, + eval_set=eval_set, + trace_progress_callback=_trace_progress, + ) + + results = build_results(run.run_id, params, run_result) + await self._results.upsert_many(run.run_id, results) + summary = summarize_run_result(run_result) + await sinks.emit_final(run.run_id, summary, run.attempt) + await self._runs.update_status(run.run_id, RunStatus.SUCCEEDED, summary=summary) + logger.info( + "run=%s succeeded (traces=%d, results=%d)", + run.run_id, + len(run_result.trace_results), + len(results), + ) + + async def _heartbeat(self, run_id: UUID, worker_id: str, cancel_event: asyncio.Event) -> None: + lease = timedelta(seconds=self._settings.lease_s) + interval = self._settings.heartbeat_s + try: + while True: + await asyncio.sleep(interval) + alive = await self._runs.heartbeat(run_id, worker_id, lease) + if not alive: + cancel_event.set() + return + except asyncio.CancelledError: + return diff --git a/src/agentevals/runner.py b/src/agentevals/runner.py index 7b5c5fc..5e4f634 100644 --- a/src/agentevals/runner.py +++ b/src/agentevals/runner.py @@ -59,6 +59,7 @@ class RunResult(BaseModel): trace_results: list[TraceResult] = Field(default_factory=list) errors: list[str] = Field(default_factory=list) performance_metrics: dict[str, Any] | None = None + run_id: str | None = None def load_eval_set(path: str) -> EvalSet: diff --git a/src/agentevals/storage/__init__.py b/src/agentevals/storage/__init__.py new file mode 100644 index 0000000..49a35b5 --- /dev/null +++ b/src/agentevals/storage/__init__.py @@ -0,0 +1,48 @@ +"""Storage abstractions for agentevals. + +Two backends ship: ``memory`` (default, preserves zero-config developer +experience) and ``postgres`` (durable runs/results, enables ``/api/runs``). + +The public surface is :class:`Repos`, a small bundle of repository +implementations selected by :class:`StorageSettings.backend`. +""" + +from __future__ import annotations + +from .config import StorageSettings +from .models import Result, ResultStatus, Run, RunSpec, RunStatus, TraceTarget +from .repos import Repos, ResultRepository, RunRepository, SessionRepository + +__all__ = [ + "Repos", + "Result", + "ResultRepository", + "ResultStatus", + "Run", + "RunRepository", + "RunSpec", + "RunStatus", + "SessionRepository", + "StorageSettings", + "TraceTarget", + "build_repos", +] + + +async def build_repos(settings: StorageSettings) -> Repos: + """Construct the repository bundle for ``settings.backend``. + + Memory backend instantiates dict-backed repos eagerly. Postgres backend + creates an asyncpg pool, applies pending migrations, then wires repos + against that pool. 
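+
+    A sketch of the intended call pattern (the server lifespan does the real
+    wiring; shown here only to make ownership of ``close()`` explicit)::
+
+        settings = StorageSettings.from_env()
+        repos = await build_repos(settings)
+        try:
+            ...  # hand repos.runs / repos.results to the API and worker
+        finally:
+            await repos.close()  # no-op for memory; closes the pool for postgres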
+ """ + if settings.backend == "memory": + from .repos.memory import MemoryRepos + + return MemoryRepos.create() + + from .postgres.pool import create_pool + from .repos.postgres import PostgresRepos + + pool = await create_pool(settings) + return await PostgresRepos.create(pool=pool, schema=settings.schema_name) diff --git a/src/agentevals/storage/config.py b/src/agentevals/storage/config.py new file mode 100644 index 0000000..62beb77 --- /dev/null +++ b/src/agentevals/storage/config.py @@ -0,0 +1,76 @@ +"""Storage configuration loaded from AGENTEVALS_* env vars.""" + +from __future__ import annotations + +import os +from typing import Literal + +from pydantic import BaseModel, Field, field_validator + +Backend = Literal["memory", "postgres"] + + +class StorageSettings(BaseModel): + """Runtime storage knobs. + + Read from environment in :meth:`from_env`. Defaults preserve the + pre-existing in-memory developer experience: no Postgres required, no + ``/api/runs`` endpoints registered. + """ + + backend: Backend = "memory" + database_url: str | None = None + schema_name: str = "agentevals" + migrate_lock_timeout_s: int = 60 + + max_concurrent_runs: int = Field(default=4, ge=1) + run_deadline_s: int = Field(default=300, ge=1) + heartbeat_s: int = Field(default=5, ge=1) + lease_s: int = Field(default=30, ge=1) + max_run_attempts: int = Field(default=3, ge=1) + worker_poll_interval_s: float = Field(default=1.0, gt=0) + + @field_validator("backend") + @classmethod + def _validate_backend(cls, v: Backend) -> Backend: + if v not in ("memory", "postgres"): + raise ValueError(f"unknown storage backend '{v}'; expected 'memory' or 'postgres'") + return v + + def model_post_init(self, __context: object) -> None: + if self.lease_s <= self.heartbeat_s: + raise ValueError( + f"AGENTEVALS_LEASE_S ({self.lease_s}) must be greater than AGENTEVALS_HEARTBEAT_S ({self.heartbeat_s})" + ) + if self.backend == "postgres" and not self.database_url: + raise ValueError("AGENTEVALS_STORAGE_BACKEND=postgres requires AGENTEVALS_DATABASE_URL") + + @classmethod + def from_env(cls) -> StorageSettings: + return cls( + backend=os.environ.get("AGENTEVALS_STORAGE_BACKEND", "memory"), + database_url=_read_dsn_from_env(), + schema_name=os.environ.get("AGENTEVALS_DATABASE_SCHEMA", "agentevals"), + migrate_lock_timeout_s=int(os.environ.get("AGENTEVALS_MIGRATE_LOCK_TIMEOUT", "60")), + max_concurrent_runs=int(os.environ.get("AGENTEVALS_MAX_CONCURRENT_RUNS", "4")), + run_deadline_s=int(os.environ.get("AGENTEVALS_RUN_DEADLINE_S", "300")), + heartbeat_s=int(os.environ.get("AGENTEVALS_HEARTBEAT_S", "5")), + lease_s=int(os.environ.get("AGENTEVALS_LEASE_S", "30")), + max_run_attempts=int(os.environ.get("AGENTEVALS_MAX_RUN_ATTEMPTS", "3")), + worker_poll_interval_s=float(os.environ.get("AGENTEVALS_WORKER_POLL_INTERVAL_S", "1.0")), + ) + + +def _read_dsn_from_env() -> str | None: + """Return the DSN with AGENTEVALS_DATABASE_URL_FILE preferred over the + inline AGENTEVALS_DATABASE_URL. 
The file path is intended for projected
+    workload-identity tokens or other secret rotators that prefer a file
+    surface to an env var."""
+    file_path = os.environ.get("AGENTEVALS_DATABASE_URL_FILE")
+    if file_path:
+        try:
+            with open(file_path) as f:
+                return f.read().strip() or None
+        except OSError as exc:
+            raise ValueError(f"AGENTEVALS_DATABASE_URL_FILE={file_path!r} is unreadable: {exc}") from exc
+    return os.environ.get("AGENTEVALS_DATABASE_URL")
diff --git a/src/agentevals/storage/models.py b/src/agentevals/storage/models.py
new file mode 100644
index 0000000..fa38636
--- /dev/null
+++ b/src/agentevals/storage/models.py
@@ -0,0 +1,169 @@
+"""Pydantic models for persisted Run and Result rows.
+
+These shapes are the durable, host-facing contract returned by ``/api/runs``
+and emitted via :class:`ResultSink`. They are deliberately distinct from the
+in-pipeline :class:`agentevals.runner.MetricResult` so renaming the persisted
+fields (``status``, ``error_text``, ``latency_ms``) does not break the existing
+``/api/evaluate`` SSE consumers.
+"""
+
+from __future__ import annotations
+
+import hashlib
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any, Literal
+from uuid import UUID
+
+from pydantic import BaseModel, ConfigDict, Field
+from pydantic.alias_generators import to_camel
+
+
+class RunStatus(str, Enum):
+    QUEUED = "queued"
+    RUNNING = "running"
+    SUCCEEDED = "succeeded"
+    FAILED = "failed"
+    CANCELLED = "cancelled"
+
+
+class ResultStatus(str, Enum):
+    PASSED = "passed"
+    FAILED = "failed"
+    ERRORED = "errored"
+    SKIPPED = "skipped"
+
+
+def _now() -> datetime:
+    return datetime.now(timezone.utc)
+
+
+def compute_result_id(run_id: UUID | str, eval_set_item_id: str, evaluator_name: str) -> str:
+    """Canonical SHA-256 of ``{run_id}|{eval_set_item_id}|{evaluator_name}``.
+
+    Deterministic so both retried webhook posts and retried executor runs
+    deduplicate cleanly via INSERT ... ON CONFLICT (result_id) DO UPDATE.
+    """
+    payload = f"{str(run_id).lower()}|{eval_set_item_id}|{evaluator_name}".encode()
+    return hashlib.sha256(payload).hexdigest()
+
+
+class TraceTarget(BaseModel):
+    """Where a run gets its trace from.
+
+    Discriminated by ``kind``:
+    - ``inline``: the OTLP/Jaeger JSON dict is embedded directly in the spec.
+    - ``http``: a TraceFetcher GETs from ``base_url + "/" + trace_id`` using
+      the run's ``context.headers``.
+    - ``uploaded``: synthesis-only kind written by ``/api/evaluate`` after a
+      synchronous UI/multipart upload completes. Records ``trace_count`` and
+      ``trace_files`` for audit but the trace bytes themselves are not
+      retained, so an ``uploaded`` run cannot be re-executed by the worker.
+    """
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True, extra="allow")
+
+    kind: Literal["inline", "http", "uploaded"]
+    inline: dict[str, Any] | None = None
+    base_url: str | None = None
+    trace_id: str | None = None
+    trace_format: Literal["jaeger-json", "otlp-json"] | None = None
+    trace_count: int | None = None
+    trace_files: list[str] | None = None
+
+
+class RunSpec(BaseModel):
+    """Validated submission body.
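+
+    A minimal accepted body (camelCase keys via the ``to_camel`` alias
+    generator; snake_case also works thanks to ``populate_by_name``)::
+
+        {"approach": "trace_replay",
+         "target": {"kind": "inline", "inline": {...}},
+         "evalConfig": {"metrics": [...]},
+         "sinks": [{"kind": "stdout"}]}
+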
Stored verbatim in ``agentevals.run.spec``.""" + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True, extra="allow") + + approach: Literal["trace_replay"] = "trace_replay" + target: TraceTarget + eval_set: dict[str, Any] | None = None + eval_config: dict[str, Any] = Field(default_factory=dict) + sinks: list[dict[str, Any]] = Field(default_factory=list) + context: dict[str, Any] = Field(default_factory=dict) + + +class Run(BaseModel): + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + run_id: UUID + status: RunStatus + spec: RunSpec + attempt: int = 0 + worker_id: str | None = None + error: str | None = None + summary: dict[str, Any] | None = None + created_at: datetime = Field(default_factory=_now) + started_at: datetime | None = None + finished_at: datetime | None = None + cancel_requested: bool = False + + +class Result(BaseModel): + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + result_id: str + run_id: UUID + eval_set_item_id: str + eval_set_item_name: str + evaluator_name: str + evaluator_type: Literal["builtin", "code", "remote", "openai_eval"] + status: ResultStatus + score: float | None = None + per_invocation_scores: list[float | None] = Field(default_factory=list) + trace_id: str | None = None + span_id: str | None = None + details: dict[str, Any] = Field(default_factory=dict) + error_text: str | None = None + tokens_used: dict[str, Any] | None = None + latency_ms: int | None = None + created_at: datetime = Field(default_factory=_now) + + @classmethod + def from_metric_result( + cls, + *, + run_id: UUID, + eval_set_item_id: str, + eval_set_item_name: str, + trace_id: str | None, + evaluator_type: Literal["builtin", "code", "remote", "openai_eval"], + metric_result: Any, + ) -> Result: + """Project an in-pipeline MetricResult onto the persisted shape. + + ADK emits ``eval_status`` strings ``PASSED`` / ``FAILED`` / + ``NOT_EVALUATED``; we additionally map presence of ``error`` to + ``errored`` so downstream consumers don't have to special-case + evaluator failures. + """ + if metric_result.error: + status = ResultStatus.ERRORED + else: + raw = (metric_result.eval_status or "NOT_EVALUATED").upper() + status = { + "PASSED": ResultStatus.PASSED, + "FAILED": ResultStatus.FAILED, + }.get(raw, ResultStatus.SKIPPED) + + scores: list[float | None] = list(metric_result.per_invocation_scores or []) + latency_ms = int(metric_result.duration_ms) if metric_result.duration_ms is not None else None + + return cls( + result_id=compute_result_id(run_id, eval_set_item_id, metric_result.metric_name), + run_id=run_id, + eval_set_item_id=eval_set_item_id, + eval_set_item_name=eval_set_item_name, + evaluator_name=metric_result.metric_name, + evaluator_type=evaluator_type, + status=status, + score=metric_result.score, + per_invocation_scores=scores, + trace_id=trace_id, + details=metric_result.details or {}, + error_text=metric_result.error, + latency_ms=latency_ms, + ) diff --git a/src/agentevals/storage/postgres/__init__.py b/src/agentevals/storage/postgres/__init__.py new file mode 100644 index 0000000..3dc47d8 --- /dev/null +++ b/src/agentevals/storage/postgres/__init__.py @@ -0,0 +1,5 @@ +"""Postgres backend (asyncpg, no ORM). + +Hand-written SQL because we lean on PG-specific features (FOR UPDATE SKIP +LOCKED, pg_try_advisory_lock, JSONB, ARRAY) that an ORM would obscure. 
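+
+The load-bearing example is the claim query in ``repos/postgres.py``
+(abridged; the real query also gates on attempts and cancellation)::
+
+    SELECT run_id FROM {schema}.run
+    WHERE status = 'queued'
+       OR (status = 'running' AND lease_expires_at < now())
+    ORDER BY created_at
+    LIMIT 1
+    FOR UPDATE SKIP LOCKED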
+""" diff --git a/src/agentevals/storage/postgres/migrations/000001_init.down.sql b/src/agentevals/storage/postgres/migrations/000001_init.down.sql new file mode 100644 index 0000000..131b385 --- /dev/null +++ b/src/agentevals/storage/postgres/migrations/000001_init.down.sql @@ -0,0 +1,5 @@ +-- WARNING: dropping the schema deletes ALL agentevals data: sessions, runs, +-- results, and the evaluator cache. This file is invoked only by +-- ``agentevals migrate down --steps N`` and is not safe to run in production. + +DROP SCHEMA IF EXISTS {schema} CASCADE; diff --git a/src/agentevals/storage/postgres/migrations/000001_init.up.sql b/src/agentevals/storage/postgres/migrations/000001_init.up.sql new file mode 100644 index 0000000..98bcaba --- /dev/null +++ b/src/agentevals/storage/postgres/migrations/000001_init.up.sql @@ -0,0 +1,110 @@ +-- agentevals baseline schema. Immutable once tagged in a release. +-- Schema changes go in a NEW migration file (000002_*.up.sql, etc.). +-- The {schema} placeholder is substituted by the Python migrator at apply time. + +CREATE SCHEMA IF NOT EXISTS {schema}; + +CREATE TABLE IF NOT EXISTS {schema}.session ( + session_id TEXT PRIMARY KEY, + trace_id TEXT NOT NULL, + trace_ids TEXT[] NOT NULL DEFAULT '{{}}', + eval_set_id TEXT, + source TEXT NOT NULL CHECK (source IN ('websocket', 'otlp', 'api')), + is_complete BOOLEAN NOT NULL DEFAULT FALSE, + has_root_span BOOLEAN NOT NULL DEFAULT FALSE, + metadata JSONB NOT NULL DEFAULT '{{}}', + started_at TIMESTAMPTZ NOT NULL DEFAULT now(), + completed_at TIMESTAMPTZ, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now(), + expires_at TIMESTAMPTZ +); + +CREATE INDEX IF NOT EXISTS session_expires_at_idx + ON {schema}.session (expires_at) + WHERE expires_at IS NOT NULL; + +-- Reserved for future per-span / per-log persistence. Spans and logs stay +-- in-process on StreamingTraceManager in this OSS slice; this table exists +-- so a future migration can populate it without an ALTER on session. +CREATE TABLE IF NOT EXISTS {schema}.session_event ( + event_id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, + session_id TEXT NOT NULL REFERENCES {schema}.session(session_id) ON DELETE CASCADE, + kind TEXT NOT NULL CHECK (kind IN ('span', 'log')), + payload JSONB NOT NULL, + received_at TIMESTAMPTZ NOT NULL DEFAULT now() +); + +CREATE INDEX IF NOT EXISTS session_event_session_id_idx + ON {schema}.session_event (session_id, event_id); + +-- Run state and work queue. claim_next() relies on the run_queue_idx for +-- SELECT FOR UPDATE SKIP LOCKED ordering. 
+CREATE TABLE IF NOT EXISTS {schema}.run ( + run_id UUID PRIMARY KEY, + status TEXT NOT NULL CHECK (status IN + ('queued', 'running', 'succeeded', 'failed', 'cancelled')), + approach TEXT NOT NULL CHECK (approach IN ('trace_replay', 'agent_invoke')), + spec JSONB NOT NULL, + attempt INT NOT NULL DEFAULT 0, + worker_id TEXT, + claimed_at TIMESTAMPTZ, + lease_expires_at TIMESTAMPTZ, + cancel_requested BOOLEAN NOT NULL DEFAULT FALSE, + error TEXT, + summary JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + started_at TIMESTAMPTZ, + finished_at TIMESTAMPTZ, + CONSTRAINT run_running_has_worker + CHECK (status <> 'running' + OR (worker_id IS NOT NULL + AND claimed_at IS NOT NULL + AND lease_expires_at IS NOT NULL)) +); + +CREATE INDEX IF NOT EXISTS run_queue_idx + ON {schema}.run (status, created_at) + WHERE status IN ('queued', 'running'); + +CREATE INDEX IF NOT EXISTS run_lease_idx + ON {schema}.run (lease_expires_at) + WHERE status = 'running'; + +CREATE TABLE IF NOT EXISTS {schema}.result ( + result_id TEXT PRIMARY KEY, + run_id UUID NOT NULL REFERENCES {schema}.run(run_id) ON DELETE CASCADE, + eval_set_item_id TEXT NOT NULL, + eval_set_item_name TEXT NOT NULL, + evaluator_name TEXT NOT NULL, + evaluator_type TEXT NOT NULL CHECK (evaluator_type IN + ('builtin', 'code', 'remote', 'openai_eval')), + status TEXT NOT NULL CHECK (status IN + ('passed', 'failed', 'errored', 'skipped')), + score DOUBLE PRECISION, + per_invocation_scores DOUBLE PRECISION[] NOT NULL DEFAULT '{{}}', + trace_id TEXT, + span_id TEXT, + details JSONB NOT NULL DEFAULT '{{}}', + error_text TEXT, + tokens_used JSONB, + latency_ms INT, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + expires_at TIMESTAMPTZ +); + +CREATE INDEX IF NOT EXISTS result_run_id_idx ON {schema}.result (run_id); +CREATE INDEX IF NOT EXISTS result_expires_at_idx ON {schema}.result (expires_at) WHERE expires_at IS NOT NULL; + +-- Reserved for cached evaluator code from external sources (GitHub today, +-- additional sources later). No read/write code in this slice; included here +-- so a future change does not require an ALTER on this table. +CREATE TABLE IF NOT EXISTS {schema}.evaluator_cache ( + source_name TEXT NOT NULL, + evaluator_name TEXT NOT NULL, + ref TEXT NOT NULL, + content BYTEA NOT NULL, + metadata JSONB NOT NULL DEFAULT '{{}}', + fetched_at TIMESTAMPTZ NOT NULL DEFAULT now(), + PRIMARY KEY (source_name, evaluator_name, ref) +); diff --git a/src/agentevals/storage/postgres/migrator.py b/src/agentevals/storage/postgres/migrator.py new file mode 100644 index 0000000..13c207b --- /dev/null +++ b/src/agentevals/storage/postgres/migrator.py @@ -0,0 +1,286 @@ +"""SQL migration runner. + +Applies sequentially numbered migrations under +``src/agentevals/storage/postgres/migrations/``. Holds a Postgres advisory +lock for the duration so multi-replica installs can safely call ``migrate +up`` from any process. The tracking table is golang-migrate compatible +(``schema_migrations`` with ``version`` BIGINT PRIMARY KEY and ``dirty`` +BOOLEAN), so external migration tooling can adopt the same files later +without translation. +""" + +from __future__ import annotations + +import asyncio +import logging +import re +from dataclasses import dataclass +from importlib.resources import files +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import asyncpg + +logger = logging.getLogger(__name__) + +ADVISORY_LOCK_KEY = 7259820376655812345 +"""Fixed int8 used by pg_try_advisory_lock during migration runs. 
Chosen at random; collision-free for any sane application."""

+_FILE_PATTERN = re.compile(r"^(?P<version>\d{6})_(?P<name>[a-z0-9_]+)\.(?P<dir>up|down)\.sql$")
+
+
+@dataclass(frozen=True)
+class Migration:
+    version: int
+    name: str
+    up_sql: str
+    down_sql: str | None
+
+
+def _discover_migrations() -> list[Migration]:
+    """Read all NNNNNN_name.up.sql / .down.sql pairs from the package.
+
+    importlib.resources resolves correctly inside a wheel, in editable
+    installs, and from a zipped package.
+    """
+    pkg = files("agentevals.storage.postgres.migrations")
+    ups: dict[int, tuple[str, str]] = {}
+    downs: dict[int, str] = {}
+
+    for entry in pkg.iterdir():
+        match = _FILE_PATTERN.match(entry.name)
+        if not match:
+            continue
+        version = int(match.group("version"))
+        name = match.group("name")
+        sql = entry.read_text(encoding="utf-8")
+        if match.group("dir") == "up":
+            ups[version] = (name, sql)
+        else:
+            downs[version] = sql
+
+    migrations = []
+    for version in sorted(ups):
+        name, up_sql = ups[version]
+        migrations.append(Migration(version=version, name=name, up_sql=up_sql, down_sql=downs.get(version)))
+    return migrations
+
+
+def _apply_schema(sql: str, schema: str) -> str:
+    """Substitute the {schema} placeholder. Doubled braces in SQL literals
+    (``'{{}}'``) collapse back to single braces."""
+    if not re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", schema):
+        raise ValueError(f"invalid schema name '{schema}'; must be a SQL identifier")
+    return sql.replace("{schema}", schema).replace("{{}}", "{}")
+
+
+@dataclass
+class MigrationStatus:
+    version: int | None
+    dirty: bool
+
+
+class Migrator:
+    """Applies and rolls back migrations against a single Postgres database.
+
+    One advisory lock is held for the lifetime of any apply/rollback call so
+    concurrent migrators (multiple agentevals replicas booting at once) wait
+    rather than racing.
+    """
+
+    def __init__(self, dsn: str, schema: str = "agentevals", lock_timeout_s: int = 60) -> None:
+        self._dsn = dsn
+        self._schema = schema
+        self._lock_timeout_s = lock_timeout_s
+
+    async def _connect(self) -> "asyncpg.Connection":
+        try:
+            import asyncpg
+        except ImportError as exc:
+            raise ImportError(
+                "agentevals migrate requires the 'postgres' extra. Install with: uv sync --extra postgres"
+            ) from exc
+        return await connect_with_retry(self._dsn, asyncpg)
+
+    async def _acquire_lock(self, conn: "asyncpg.Connection") -> None:
+        deadline = asyncio.get_event_loop().time() + self._lock_timeout_s
+        attempt = 0
+        while True:
+            acquired = await conn.fetchval("SELECT pg_try_advisory_lock($1)", ADVISORY_LOCK_KEY)
+            if acquired:
+                return
+            if asyncio.get_event_loop().time() >= deadline:
+                raise TimeoutError(
+                    f"Could not acquire migration advisory lock within {self._lock_timeout_s}s. "
+                    "Another migration is likely in progress."
+ ) + attempt += 1 + wait = min(2.0, 0.2 * attempt) + logger.info("Waiting for migration lock (attempt %d, sleeping %.1fs)...", attempt, wait) + await asyncio.sleep(wait) + + async def _release_lock(self, conn: "asyncpg.Connection") -> None: + await conn.execute("SELECT pg_advisory_unlock($1)", ADVISORY_LOCK_KEY) + + async def _ensure_tracking_table(self, conn: "asyncpg.Connection") -> None: + await conn.execute(f'CREATE SCHEMA IF NOT EXISTS "{self._schema}"') + await conn.execute( + f'CREATE TABLE IF NOT EXISTS "{self._schema}".schema_migrations ' + "(version BIGINT NOT NULL PRIMARY KEY, dirty BOOLEAN NOT NULL)" + ) + + async def _read_status(self, conn: "asyncpg.Connection") -> MigrationStatus: + row = await conn.fetchrow(f'SELECT version, dirty FROM "{self._schema}".schema_migrations LIMIT 1') + if row is None: + return MigrationStatus(version=None, dirty=False) + return MigrationStatus(version=int(row["version"]), dirty=bool(row["dirty"])) + + async def _write_status(self, conn: "asyncpg.Connection", version: int | None, dirty: bool) -> None: + await conn.execute(f'DELETE FROM "{self._schema}".schema_migrations') + if version is not None: + await conn.execute( + f'INSERT INTO "{self._schema}".schema_migrations (version, dirty) VALUES ($1, $2)', + version, + dirty, + ) + + async def status(self) -> MigrationStatus: + conn = await self._connect() + try: + await self._ensure_tracking_table(conn) + return await self._read_status(conn) + finally: + await conn.close() + + async def up(self, *, dry_run: bool = False) -> list[int]: + migrations = _discover_migrations() + applied: list[int] = [] + conn = await self._connect() + try: + await self._ensure_tracking_table(conn) + await self._acquire_lock(conn) + try: + status = await self._read_status(conn) + if status.dirty: + raise RuntimeError( + f"schema_migrations is dirty at version {status.version}. 
" + "Resolve manually, then run: agentevals migrate force " + ) + pending = [m for m in migrations if status.version is None or m.version > status.version] + if not pending: + logger.info("Nothing to apply (current version: %s)", status.version) + return [] + for m in pending: + sql = _apply_schema(m.up_sql, self._schema) + if dry_run: + logger.info("Would apply migration %06d_%s", m.version, m.name) + applied.append(m.version) + continue + logger.info("Applying migration %06d_%s", m.version, m.name) + await self._write_status(conn, m.version, dirty=True) + try: + async with conn.transaction(): + await conn.execute(sql) + await self._write_status(conn, m.version, dirty=False) + except Exception: + logger.exception("Migration %06d_%s failed; schema_migrations left dirty", m.version, m.name) + raise + applied.append(m.version) + finally: + await self._release_lock(conn) + finally: + await conn.close() + return applied + + async def down(self, *, steps: int) -> list[tuple[int, str]]: + if steps < 1: + raise ValueError("steps must be >= 1") + migrations = _discover_migrations() + by_version = {m.version: m for m in migrations} + rolled_back: list[tuple[int, str]] = [] + conn = await self._connect() + try: + await self._ensure_tracking_table(conn) + await self._acquire_lock(conn) + try: + status = await self._read_status(conn) + if status.dirty or status.version is None: + raise RuntimeError( + f"refusing to roll back from dirty/empty state (version={status.version}, dirty={status.dirty})" + ) + applied_versions = sorted((v for v in by_version if v <= status.version), reverse=True) + target_versions = applied_versions[:steps] + for version in target_versions: + m = by_version[version] + if not m.down_sql: + raise RuntimeError(f"migration {version:06d}_{m.name} has no down.sql") + sql = _apply_schema(m.down_sql, self._schema) + logger.warning("Rolling back %06d_%s\n--- SQL ---\n%s\n--- end ---", m.version, m.name, sql) + next_version = max((v for v in by_version if v < version), default=None) + await self._write_status(conn, version, dirty=True) + try: + async with conn.transaction(): + await conn.execute(sql) + await self._write_status(conn, next_version, dirty=False) + except Exception: + logger.exception( + "Down migration %06d_%s failed; schema_migrations left dirty", m.version, m.name + ) + raise + rolled_back.append((m.version, m.name)) + if next_version is None: + break + finally: + await self._release_lock(conn) + finally: + await conn.close() + return rolled_back + + async def force(self, version: int) -> None: + conn = await self._connect() + try: + await self._ensure_tracking_table(conn) + await self._write_status(conn, version, dirty=False) + finally: + await conn.close() + + +def discover_migrations() -> list[Migration]: + """Public alias for the migration discovery helper, used by ``migrate create``.""" + return _discover_migrations() + + +CONNECT_RETRY_DEADLINE_S = 60.0 +"""Total wall-clock budget for the initial Postgres connection. Bundled PG +in Kubernetes typically takes 5-15s to be ready (PVC bind, initdb, listener +bind), so the agentevals lifespan can race the database on a fresh deploy. +Retrying tolerates that gap rather than failing pod startup and relying on +CrashLoopBackOff timing to eventually line up.""" + + +async def connect_with_retry(dsn: str, asyncpg_module) -> "asyncpg.Connection": + """Open a single asyncpg connection, retrying on connection-refused or + server-not-ready errors for up to ``CONNECT_RETRY_DEADLINE_S`` seconds. 
+ + Connection-time errors are tolerated; once a connection has been + established and a query returned, all subsequent failures propagate + normally. + """ + deadline = asyncio.get_event_loop().time() + CONNECT_RETRY_DEADLINE_S + delay = 0.5 + while True: + try: + return await asyncpg_module.connect(dsn) + except (OSError, asyncpg_module.PostgresError) as exc: + now = asyncio.get_event_loop().time() + if now >= deadline: + raise + sleep_for = min(delay, deadline - now) + logger.info( + "Database not ready (%s); retrying in %.1fs", + type(exc).__name__, + sleep_for, + ) + await asyncio.sleep(sleep_for) + delay = min(delay * 2, 5.0) diff --git a/src/agentevals/storage/postgres/pool.py b/src/agentevals/storage/postgres/pool.py new file mode 100644 index 0000000..f1446e0 --- /dev/null +++ b/src/agentevals/storage/postgres/pool.py @@ -0,0 +1,80 @@ +"""asyncpg pool factory. + +asyncpg is imported lazily so the base ``agentevals`` install (without the +``[postgres]`` extra) does not require it. +""" + +from __future__ import annotations + +import asyncio +import logging +from typing import TYPE_CHECKING + +from ..config import StorageSettings + +if TYPE_CHECKING: + import asyncpg + +logger = logging.getLogger(__name__) + + +async def create_pool(settings: StorageSettings) -> "asyncpg.Pool": + """Build an asyncpg pool sized for the worker fan-out plus headroom. + + The pool needs at least one connection per concurrent worker (claim + + heartbeat run on the same connection), one for the API request handlers, + plus a small buffer. + + Pool warmup eagerly opens ``min_size`` connections, which can race with + Postgres readiness on a fresh deploy. We retry on connection-refused so + the lifespan tolerates the gap rather than crashing the pod. + """ + try: + import asyncpg + except ImportError as exc: + raise ImportError( + "AGENTEVALS_STORAGE_BACKEND=postgres requires the 'postgres' extra. 
" + "Install with: uv sync --extra postgres (or pip install 'agentevals-cli[postgres]')" + ) from exc + + if not settings.database_url: + raise ValueError("AGENTEVALS_DATABASE_URL is required for postgres backend") + + min_size = max(2, settings.max_concurrent_runs) + max_size = settings.max_concurrent_runs * 2 + 4 + + logger.info( + "Creating asyncpg pool (min=%d, max=%d) for schema '%s'", + min_size, + max_size, + settings.schema_name, + ) + + from .migrator import CONNECT_RETRY_DEADLINE_S + + deadline = asyncio.get_event_loop().time() + CONNECT_RETRY_DEADLINE_S + delay = 0.5 + while True: + try: + pool = await asyncpg.create_pool( + dsn=settings.database_url, + min_size=min_size, + max_size=max_size, + command_timeout=60, + ) + break + except (OSError, asyncpg.PostgresError) as exc: + now = asyncio.get_event_loop().time() + if now >= deadline: + raise + sleep_for = min(delay, deadline - now) + logger.info( + "Pool warmup failed (%s); retrying in %.1fs", + type(exc).__name__, + sleep_for, + ) + await asyncio.sleep(sleep_for) + delay = min(delay * 2, 5.0) + if pool is None: + raise RuntimeError("asyncpg.create_pool returned None") + return pool diff --git a/src/agentevals/storage/repos/__init__.py b/src/agentevals/storage/repos/__init__.py new file mode 100644 index 0000000..c267be2 --- /dev/null +++ b/src/agentevals/storage/repos/__init__.py @@ -0,0 +1,90 @@ +"""Repository protocols and the bundle holder used by ``/api/runs``.""" + +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime, timedelta +from typing import Protocol +from uuid import UUID + +from ..models import Result, Run, RunStatus + +if False: # for type checking only — avoids circular import at runtime + from ...streaming.session import TraceSession + + +class SessionRepository(Protocol): + """Tracks streaming TraceSession metadata. + + Spans and logs themselves stay in-process on the StreamingTraceManager in + this OSS slice; only the session lifecycle row is persisted. + """ + + async def get(self, session_id: str) -> "TraceSession | None": ... + async def upsert(self, session: "TraceSession") -> None: ... + async def delete(self, session_id: str) -> None: ... + async def list_all(self) -> "list[TraceSession]": ... + async def find_by_trace_id(self, trace_id: str) -> "TraceSession | None": ... + + +class RunRepository(Protocol): + async def create(self, run: Run) -> Run: + """Insert a new run. Idempotent on ``run_id`` — if a row exists with + the same id, returns the persisted row unchanged. + """ + + async def get(self, run_id: UUID) -> Run | None: ... + async def list( + self, + *, + status: list[RunStatus] | None = None, + limit: int = 100, + before: datetime | None = None, + ) -> list[Run]: ... + async def claim_next(self, *, worker_id: str, lease: timedelta, max_attempts: int) -> Run | None: + """Atomically claim a queued or lease-expired run via SELECT FOR UPDATE + SKIP LOCKED. Returns ``None`` if no work is available. + """ + + async def heartbeat(self, run_id: UUID, worker_id: str, lease: timedelta) -> bool: + """Extend the lease. Returns False if the run was cancelled or lost.""" + + async def update_status( + self, + run_id: UUID, + status: RunStatus, + *, + error: str | None = None, + summary: dict | None = None, + ) -> None: ... 
+ async def cancel(self, run_id: UUID) -> bool: + """Mark cancel_requested=True; the worker observes on next heartbeat.""" + + +class ResultRepository(Protocol): + async def upsert_many(self, run_id: UUID, results: list[Result]) -> None: + """Idempotent bulk insert/update on ``result_id``.""" + + async def list_by_run(self, run_id: UUID) -> list[Result]: ... + async def delete_by_run(self, run_id: UUID) -> None: ... + + +@dataclass +class Repos: + """Bundle of the three repos plus a close hook for the underlying pool.""" + + sessions: SessionRepository + runs: RunRepository + results: ResultRepository + backend: str + + async def close(self) -> None: + pass + + +__all__ = [ + "Repos", + "ResultRepository", + "RunRepository", + "SessionRepository", +] diff --git a/src/agentevals/storage/repos/memory.py b/src/agentevals/storage/repos/memory.py new file mode 100644 index 0000000..2e9790e --- /dev/null +++ b/src/agentevals/storage/repos/memory.py @@ -0,0 +1,183 @@ +"""In-process dict-backed implementations of the repository protocols. + +Used as the default for OSS so ``agentevals run trace.json`` and ``helm +install agentevals`` keep working with no external dependencies. Behavior +matches the pre-existing :class:`StreamingTraceManager.sessions` dict that +this code replaces. +""" + +from __future__ import annotations + +import asyncio +from datetime import datetime, timedelta, timezone +from typing import TYPE_CHECKING +from uuid import UUID + +from ..models import Result, Run, RunStatus +from . import Repos, ResultRepository, RunRepository, SessionRepository + +if TYPE_CHECKING: + from ...streaming.session import TraceSession + + +def _now() -> datetime: + return datetime.now(timezone.utc) + + +class MemorySessionRepository: + def __init__(self) -> None: + self._sessions: dict[str, TraceSession] = {} + self._lock = asyncio.Lock() + + async def get(self, session_id: str) -> TraceSession | None: + async with self._lock: + return self._sessions.get(session_id) + + async def upsert(self, session: TraceSession) -> None: + async with self._lock: + self._sessions[session.session_id] = session + + async def delete(self, session_id: str) -> None: + async with self._lock: + self._sessions.pop(session_id, None) + + async def list_all(self) -> list[TraceSession]: + async with self._lock: + return list(self._sessions.values()) + + async def find_by_trace_id(self, trace_id: str) -> TraceSession | None: + async with self._lock: + for session in self._sessions.values(): + if trace_id in session.trace_ids: + return session + return None + + +class MemoryRunRepository: + def __init__(self) -> None: + self._runs: dict[UUID, Run] = {} + self._lock = asyncio.Lock() + + async def create(self, run: Run) -> Run: + async with self._lock: + existing = self._runs.get(run.run_id) + if existing is not None: + return existing + self._runs[run.run_id] = run + return run + + async def get(self, run_id: UUID) -> Run | None: + async with self._lock: + return self._runs.get(run_id) + + async def list( + self, + *, + status: list[RunStatus] | None = None, + limit: int = 100, + before: datetime | None = None, + ) -> list[Run]: + async with self._lock: + runs = list(self._runs.values()) + runs.sort(key=lambda r: r.created_at, reverse=True) + if status: + runs = [r for r in runs if r.status in status] + if before: + runs = [r for r in runs if r.created_at < before] + return runs[:limit] + + async def claim_next(self, *, worker_id: str, lease: timedelta, max_attempts: int) -> Run | None: + now = _now() + async with self._lock: + 
candidates = [r for r in self._runs.values() if r.status == RunStatus.QUEUED and r.attempt < max_attempts] + candidates.sort(key=lambda r: r.created_at) + if not candidates: + return None + run = candidates[0] + run.status = RunStatus.RUNNING + run.worker_id = worker_id + run.attempt += 1 + run.started_at = run.started_at or now + return run + + async def heartbeat(self, run_id: UUID, worker_id: str, lease: timedelta) -> bool: + async with self._lock: + run = self._runs.get(run_id) + if run is None or run.worker_id != worker_id: + return False + return not run.cancel_requested + + async def update_status( + self, + run_id: UUID, + status: RunStatus, + *, + error: str | None = None, + summary: dict | None = None, + ) -> None: + async with self._lock: + run = self._runs.get(run_id) + if run is None: + return + run.status = status + if error is not None: + run.error = error + if summary is not None: + run.summary = summary + if status in (RunStatus.SUCCEEDED, RunStatus.FAILED, RunStatus.CANCELLED): + run.finished_at = _now() + + async def cancel(self, run_id: UUID) -> bool: + async with self._lock: + run = self._runs.get(run_id) + if run is None or run.status in (RunStatus.SUCCEEDED, RunStatus.FAILED, RunStatus.CANCELLED): + return False + run.cancel_requested = True + if run.status == RunStatus.QUEUED: + run.status = RunStatus.CANCELLED + run.finished_at = _now() + return True + + +class MemoryResultRepository: + def __init__(self) -> None: + self._results: dict[str, Result] = {} + self._by_run: dict[UUID, list[str]] = {} + self._lock = asyncio.Lock() + + async def upsert_many(self, run_id: UUID, results: list[Result]) -> None: + async with self._lock: + for r in results: + self._results[r.result_id] = r + ids = self._by_run.setdefault(run_id, []) + if r.result_id not in ids: + ids.append(r.result_id) + + async def list_by_run(self, run_id: UUID) -> list[Result]: + async with self._lock: + ids = self._by_run.get(run_id, []) + return [self._results[i] for i in ids if i in self._results] + + async def delete_by_run(self, run_id: UUID) -> None: + async with self._lock: + for rid in self._by_run.pop(run_id, []): + self._results.pop(rid, None) + + +class MemoryRepos(Repos): + @classmethod + def create(cls) -> "MemoryRepos": + return cls( + sessions=MemorySessionRepository(), + runs=MemoryRunRepository(), + results=MemoryResultRepository(), + backend="memory", + ) + + +__all__ = [ + "MemoryRepos", + "MemoryResultRepository", + "MemoryRunRepository", + "MemorySessionRepository", +] diff --git a/src/agentevals/storage/repos/postgres.py b/src/agentevals/storage/repos/postgres.py new file mode 100644 index 0000000..dad4ebb --- /dev/null +++ b/src/agentevals/storage/repos/postgres.py @@ -0,0 +1,389 @@ +"""asyncpg-backed repository implementations. + +Plain SQL, no ORM. The connection pool is created in +``storage.postgres.pool.create_pool`` and lives on :class:`PostgresRepos`; +each method acquires a connection from the pool for the duration of a single +query or transaction. +""" + +from __future__ import annotations + +import json +import logging +from datetime import datetime, timedelta, timezone +from typing import TYPE_CHECKING +from uuid import UUID + +from ..models import Result, ResultStatus, Run, RunSpec, RunStatus +from . 
import Repos + +if TYPE_CHECKING: + import asyncpg + + from ...streaming.session import TraceSession + +logger = logging.getLogger(__name__) + + +def _now() -> datetime: + return datetime.now(timezone.utc) + + +def _row_to_session(row: "asyncpg.Record") -> "TraceSession": + from ...streaming.session import TraceSession + + return TraceSession( + session_id=row["session_id"], + trace_id=row["trace_id"], + eval_set_id=row["eval_set_id"], + started_at=row["started_at"], + is_complete=row["is_complete"], + completed_at=row["completed_at"], + metadata=dict(row["metadata"]) if row["metadata"] else {}, + source=row["source"], + has_root_span=row["has_root_span"], + trace_ids=set(row["trace_ids"] or []), + ) + + +def _row_to_run(row: "asyncpg.Record") -> Run: + spec_json = row["spec"] + spec_dict = json.loads(spec_json) if isinstance(spec_json, str) else spec_json + summary_json = row["summary"] + summary = json.loads(summary_json) if isinstance(summary_json, str) else summary_json + return Run( + run_id=row["run_id"], + status=RunStatus(row["status"]), + spec=RunSpec.model_validate(spec_dict), + attempt=row["attempt"], + worker_id=row["worker_id"], + error=row["error"], + summary=summary, + created_at=row["created_at"], + started_at=row["started_at"], + finished_at=row["finished_at"], + cancel_requested=row["cancel_requested"], + ) + + +def _row_to_result(row: "asyncpg.Record") -> Result: + details_json = row["details"] + details = json.loads(details_json) if isinstance(details_json, str) else details_json + tokens_json = row["tokens_used"] + tokens = json.loads(tokens_json) if isinstance(tokens_json, str) else tokens_json + return Result( + result_id=row["result_id"], + run_id=row["run_id"], + eval_set_item_id=row["eval_set_item_id"], + eval_set_item_name=row["eval_set_item_name"], + evaluator_name=row["evaluator_name"], + evaluator_type=row["evaluator_type"], + status=ResultStatus(row["status"]), + score=row["score"], + per_invocation_scores=list(row["per_invocation_scores"] or []), + trace_id=row["trace_id"], + span_id=row["span_id"], + details=dict(details) if details else {}, + error_text=row["error_text"], + tokens_used=dict(tokens) if tokens else None, + latency_ms=row["latency_ms"], + created_at=row["created_at"], + ) + + +class PostgresSessionRepository: + def __init__(self, pool: "asyncpg.Pool", schema: str) -> None: + self._pool = pool + self._schema = schema + + @property + def _t(self) -> str: + return f'"{self._schema}".session' + + async def get(self, session_id: str) -> "TraceSession | None": + row = await self._pool.fetchrow(f"SELECT * FROM {self._t} WHERE session_id = $1", session_id) + return _row_to_session(row) if row else None + + async def upsert(self, session: "TraceSession") -> None: + await self._pool.execute( + f""" + INSERT INTO {self._t} + (session_id, trace_id, trace_ids, eval_set_id, source, is_complete, + has_root_span, metadata, started_at, completed_at, updated_at) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8::jsonb, $9, $10, now()) + ON CONFLICT (session_id) DO UPDATE SET + trace_id = EXCLUDED.trace_id, + trace_ids = EXCLUDED.trace_ids, + eval_set_id = EXCLUDED.eval_set_id, + source = EXCLUDED.source, + is_complete = EXCLUDED.is_complete, + has_root_span= EXCLUDED.has_root_span, + metadata = EXCLUDED.metadata, + started_at = EXCLUDED.started_at, + completed_at = EXCLUDED.completed_at, + updated_at = now() + """, + session.session_id, + session.trace_id, + sorted(session.trace_ids), + session.eval_set_id, + session.source, + session.is_complete, + 
session.has_root_span, + json.dumps(session.metadata or {}), + session.started_at, + session.completed_at, + ) + + async def delete(self, session_id: str) -> None: + await self._pool.execute(f"DELETE FROM {self._t} WHERE session_id = $1", session_id) + + async def list_all(self) -> "list[TraceSession]": + rows = await self._pool.fetch(f"SELECT * FROM {self._t} ORDER BY started_at DESC") + return [_row_to_session(r) for r in rows] + + async def find_by_trace_id(self, trace_id: str) -> "TraceSession | None": + row = await self._pool.fetchrow( + f"SELECT * FROM {self._t} WHERE $1 = ANY(trace_ids) OR trace_id = $1 ORDER BY started_at DESC LIMIT 1", + trace_id, + ) + return _row_to_session(row) if row else None + + +class PostgresRunRepository: + def __init__(self, pool: "asyncpg.Pool", schema: str) -> None: + self._pool = pool + self._schema = schema + + @property + def _t(self) -> str: + return f'"{self._schema}".run' + + async def create(self, run: Run) -> Run: + spec_json = run.spec.model_dump_json(by_alias=False) + row = await self._pool.fetchrow( + f""" + INSERT INTO {self._t} + (run_id, status, approach, spec, attempt, created_at) + VALUES ($1, $2, $3, $4::jsonb, 0, $5) + ON CONFLICT (run_id) DO NOTHING + RETURNING * + """, + run.run_id, + run.status.value, + run.spec.approach, + spec_json, + run.created_at, + ) + if row is not None: + return _row_to_run(row) + existing = await self.get(run.run_id) + if existing is None: + raise RuntimeError(f"run {run.run_id} disappeared between INSERT ... ON CONFLICT and SELECT") + return existing + + async def get(self, run_id: UUID) -> Run | None: + row = await self._pool.fetchrow(f"SELECT * FROM {self._t} WHERE run_id = $1", run_id) + return _row_to_run(row) if row else None + + async def list( + self, + *, + status: list[RunStatus] | None = None, + limit: int = 100, + before: datetime | None = None, + ) -> list[Run]: + clauses: list[str] = [] + args: list[object] = [] + if status: + args.append([s.value for s in status]) + clauses.append(f"status = ANY(${len(args)})") + if before: + args.append(before) + clauses.append(f"created_at < ${len(args)}") + where = ("WHERE " + " AND ".join(clauses)) if clauses else "" + args.append(limit) + rows = await self._pool.fetch( + f"SELECT * FROM {self._t} {where} ORDER BY created_at DESC LIMIT ${len(args)}", + *args, + ) + return [_row_to_run(r) for r in rows] + + async def claim_next(self, *, worker_id: str, lease: timedelta, max_attempts: int) -> Run | None: + lease_seconds = int(lease.total_seconds()) + async with self._pool.acquire() as conn: + async with conn.transaction(): + row = await conn.fetchrow( + f""" + UPDATE {self._t} + SET status = 'running', + worker_id = $1, + claimed_at = now(), + lease_expires_at = now() + make_interval(secs => $2), + started_at = COALESCE(started_at, now()), + attempt = attempt + 1 + WHERE run_id = ( + SELECT run_id FROM {self._t} + WHERE attempt < $3 + AND cancel_requested = FALSE + AND (status = 'queued' + OR (status = 'running' AND lease_expires_at < now())) + ORDER BY created_at + LIMIT 1 + FOR UPDATE SKIP LOCKED + ) + RETURNING * + """, + worker_id, + lease_seconds, + max_attempts, + ) + return _row_to_run(row) if row else None + + async def heartbeat(self, run_id: UUID, worker_id: str, lease: timedelta) -> bool: + lease_seconds = int(lease.total_seconds()) + row = await self._pool.fetchrow( + f""" + UPDATE {self._t} + SET lease_expires_at = now() + make_interval(secs => $1) + WHERE run_id = $2 + AND worker_id = $3 + AND status = 'running' + AND cancel_requested = FALSE 
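+                  -- cancel_requested=TRUE makes this UPDATE match zero rows;
+                  -- the worker's heartbeat loop then reports "not alive",
+                  -- sets its cancel event, and the run finalizes as cancelled.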
+ RETURNING run_id + """, + lease_seconds, + run_id, + worker_id, + ) + return row is not None + + async def update_status( + self, + run_id: UUID, + status: RunStatus, + *, + error: str | None = None, + summary: dict | None = None, + ) -> None: + terminal = status in (RunStatus.SUCCEEDED, RunStatus.FAILED, RunStatus.CANCELLED) + await self._pool.execute( + f""" + UPDATE {self._t} + SET status = $1, + error = COALESCE($2, error), + summary = COALESCE($3::jsonb, summary), + finished_at = CASE WHEN $4 THEN now() ELSE finished_at END, + worker_id = CASE WHEN $4 THEN NULL ELSE worker_id END, + lease_expires_at = CASE WHEN $4 THEN NULL ELSE lease_expires_at END, + claimed_at = CASE WHEN $4 THEN NULL ELSE claimed_at END + WHERE run_id = $5 + """, + status.value, + error, + json.dumps(summary) if summary is not None else None, + terminal, + run_id, + ) + + async def cancel(self, run_id: UUID) -> bool: + row = await self._pool.fetchrow( + f""" + UPDATE {self._t} + SET cancel_requested = TRUE, + status = CASE WHEN status = 'queued' THEN 'cancelled' ELSE status END, + finished_at = CASE WHEN status = 'queued' THEN now() ELSE finished_at END + WHERE run_id = $1 + AND status IN ('queued', 'running') + RETURNING run_id + """, + run_id, + ) + return row is not None + + +class PostgresResultRepository: + def __init__(self, pool: "asyncpg.Pool", schema: str) -> None: + self._pool = pool + self._schema = schema + + @property + def _t(self) -> str: + return f'"{self._schema}".result' + + async def upsert_many(self, run_id: UUID, results: list[Result]) -> None: + if not results: + return + rows = [ + ( + r.result_id, + r.run_id, + r.eval_set_item_id, + r.eval_set_item_name, + r.evaluator_name, + r.evaluator_type, + r.status.value, + r.score, + [s for s in r.per_invocation_scores if s is not None], + r.trace_id, + r.span_id, + json.dumps(r.details or {}), + r.error_text, + json.dumps(r.tokens_used) if r.tokens_used is not None else None, + r.latency_ms, + r.created_at, + ) + for r in results + ] + async with self._pool.acquire() as conn: + async with conn.transaction(): + await conn.executemany( + f""" + INSERT INTO {self._t} + (result_id, run_id, eval_set_item_id, eval_set_item_name, + evaluator_name, evaluator_type, status, score, + per_invocation_scores, trace_id, span_id, details, + error_text, tokens_used, latency_ms, created_at) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12::jsonb, + $13, $14::jsonb, $15, $16) + ON CONFLICT (result_id) DO UPDATE SET + status = EXCLUDED.status, + score = EXCLUDED.score, + per_invocation_scores = EXCLUDED.per_invocation_scores, + details = EXCLUDED.details, + error_text = EXCLUDED.error_text, + tokens_used = EXCLUDED.tokens_used, + latency_ms = EXCLUDED.latency_ms + """, + rows, + ) + + async def list_by_run(self, run_id: UUID) -> list[Result]: + rows = await self._pool.fetch( + f"SELECT * FROM {self._t} WHERE run_id = $1 ORDER BY created_at", + run_id, + ) + return [_row_to_result(r) for r in rows] + + async def delete_by_run(self, run_id: UUID) -> None: + await self._pool.execute(f"DELETE FROM {self._t} WHERE run_id = $1", run_id) + + +class PostgresRepos(Repos): + """Repos backed by a single asyncpg pool. 
``close()`` shuts the pool down.""" + + def __init__(self, *, pool: "asyncpg.Pool", schema: str) -> None: + super().__init__( + sessions=PostgresSessionRepository(pool, schema), + runs=PostgresRunRepository(pool, schema), + results=PostgresResultRepository(pool, schema), + backend="postgres", + ) + self._pool = pool + + @classmethod + async def create(cls, *, pool: "asyncpg.Pool", schema: str) -> "PostgresRepos": + return cls(pool=pool, schema=schema) + + async def close(self) -> None: + await self._pool.close() diff --git a/tests/api/__init__.py b/tests/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/api/test_evaluate_persistence.py b/tests/api/test_evaluate_persistence.py new file mode 100644 index 0000000..d4bd690 --- /dev/null +++ b/tests/api/test_evaluate_persistence.py @@ -0,0 +1,173 @@ +"""Option A: /api/evaluate variants persist when run_service is configured. + +These tests stub a memory-backed RunService onto app.state so we can drive +the persistence path without standing up a real Postgres. The lifespan +itself only configures run_service when AGENTEVALS_STORAGE_BACKEND=postgres, +so production behavior matches: memory backend leaves runId=null and never +writes; postgres backend persists. +""" + +from __future__ import annotations + +import asyncio +import json +from pathlib import Path + +import pytest +from fastapi.testclient import TestClient + +from agentevals.api.app import create_app +from agentevals.run.service import RunService +from agentevals.storage.repos.memory import MemoryRepos + +REPO_ROOT = Path(__file__).resolve().parents[2] +SAMPLE_TRACE = REPO_ROOT / "samples" / "helm.json" + + +def _has_sample() -> bool: + return SAMPLE_TRACE.exists() + + +@pytest.fixture +def app_no_runs(): + """No run_service injected, so /api/evaluate runs but does not persist.""" + return create_app() + + +@pytest.fixture +def app_with_runs(): + """Memory-backed run_service simulates the postgres-enabled deployment.""" + repos = MemoryRepos.create() + app = create_app() + app.state.run_service = RunService(repos.runs, repos.results) + return app, repos + + +@pytest.mark.skipif(not _has_sample(), reason="samples/helm.json missing") +class TestEvaluateMultipartSync: + def test_no_run_id_in_response_when_run_service_unset(self, app_no_runs): + with TestClient(app_no_runs) as client: + with SAMPLE_TRACE.open("rb") as f: + r = client.post( + "/api/evaluate", + files={"trace_files": ("helm.json", f, "application/json")}, + data={"config": '{"metrics": ["tool_trajectory_avg_score"]}'}, + ) + assert r.status_code == 200 + assert r.json()["data"].get("runId") is None + + def test_run_persisted_when_run_service_set(self, app_with_runs): + app, repos = app_with_runs + with TestClient(app) as client: + with SAMPLE_TRACE.open("rb") as f: + r = client.post( + "/api/evaluate", + files={"trace_files": ("helm.json", f, "application/json")}, + data={"config": '{"metrics": ["tool_trajectory_avg_score"]}'}, + ) + assert r.status_code == 200 + run_id = r.json()["data"]["runId"] + assert run_id is not None + runs = asyncio.run(repos.runs.list()) + assert len(runs) == 1 + run = runs[0] + assert str(run.run_id) == run_id + # Status is succeeded because no top-level errors fired even though + # the metric_result inside may have errored (no eval_set provided). 
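+            # We accept "failed" too: per-metric errors can flip the aggregate.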
+ assert run.status.value in ("succeeded", "failed") + # The "uploaded" target kind captures audit metadata about the upload + assert run.spec.target.kind == "uploaded" + assert run.spec.target.trace_files == ["helm.json"] + assert run.spec.target.trace_count == 1 + + def test_results_persisted_alongside_run(self, app_with_runs): + app, repos = app_with_runs + with TestClient(app) as client: + with SAMPLE_TRACE.open("rb") as f: + r = client.post( + "/api/evaluate", + files={"trace_files": ("helm.json", f, "application/json")}, + data={"config": '{"metrics": ["tool_trajectory_avg_score"]}'}, + ) + run_id = r.json()["data"]["runId"] + results = asyncio.run(repos.results.list_by_run(_uuid(run_id))) + assert len(results) >= 1 + for res in results: + assert res.evaluator_type in ("builtin", "code", "remote", "openai_eval") + assert res.run_id == _uuid(run_id) + + def test_each_call_creates_distinct_run(self, app_with_runs): + """Multiple UI uploads accumulate in run history; each gets its own + Run row. This is the core OSS user value of Option A.""" + app, repos = app_with_runs + with TestClient(app) as client: + for _ in range(3): + with SAMPLE_TRACE.open("rb") as f: + client.post( + "/api/evaluate", + files={"trace_files": ("helm.json", f, "application/json")}, + data={"config": '{"metrics": ["tool_trajectory_avg_score"]}'}, + ) + runs = asyncio.run(repos.runs.list()) + assert len(runs) == 3 + assert len({r.run_id for r in runs}) == 3 + + def test_persistence_failure_does_not_break_response(self, app_with_runs, monkeypatch): + """The eval result must reach the caller even if persistence fails; + history is best-effort, the eval contract is not.""" + app, repos = app_with_runs + + async def boom(*args, **kwargs): + raise RuntimeError("simulated persistence outage") + + monkeypatch.setattr(app.state.run_service, "record_completed_eval", boom) + with TestClient(app) as client: + with SAMPLE_TRACE.open("rb") as f: + r = client.post( + "/api/evaluate", + files={"trace_files": ("helm.json", f, "application/json")}, + data={"config": '{"metrics": ["tool_trajectory_avg_score"]}'}, + ) + assert r.status_code == 200 + assert r.json()["data"].get("runId") is None + + +@pytest.mark.skipif(not _has_sample(), reason="samples/helm.json missing") +class TestEvaluateSseStream: + def test_done_event_includes_run_id_when_persisted(self, app_with_runs): + app, _repos = app_with_runs + with TestClient(app) as client: + with SAMPLE_TRACE.open("rb") as f: + with client.stream( + "POST", + "/api/evaluate/stream", + files={"trace_files": ("helm.json", f, "application/json")}, + data={"config": '{"metrics": ["tool_trajectory_avg_score"]}'}, + ) as resp: + body = b"".join(resp.iter_bytes()).decode() + # The done event payload is JSON in the last `data:` block. 
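+                # e.g.  data: {"done": true, "result": {"runId": "..."}}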
+ done_payload = _last_done_payload(body) + assert done_payload is not None + assert done_payload.get("result", {}).get("runId") is not None + + +def _last_done_payload(sse_text: str) -> dict | None: + """Pick the SSE event whose JSON carries ``done: true`` (the SSEDoneEvent + shape from api/models.py — ``{"done": true, "result": {...}}``).""" + last = None + for line in sse_text.splitlines(): + if not line.startswith("data: "): + continue + try: + payload = json.loads(line[len("data: ") :]) + except json.JSONDecodeError: + continue + if payload.get("done") is True: + last = payload + return last + + +def _uuid(value): + from uuid import UUID + + return UUID(value) diff --git a/tests/api/test_runs_routes.py b/tests/api/test_runs_routes.py new file mode 100644 index 0000000..6153006 --- /dev/null +++ b/tests/api/test_runs_routes.py @@ -0,0 +1,185 @@ +"""HTTP-level tests for /api/runs endpoints.""" + +from __future__ import annotations + +import json +from uuid import uuid4 + +import pytest +from fastapi.testclient import TestClient + +from agentevals.api.app import create_app +from agentevals.run.service import RunService +from agentevals.storage.repos.memory import MemoryRepos + + +@pytest.fixture +def memory_app(monkeypatch): + """App with the storage env unset; backend defaults to memory and + /api/runs handlers should return 503 with a configuration hint.""" + for var in ("AGENTEVALS_STORAGE_BACKEND", "AGENTEVALS_DATABASE_URL"): + monkeypatch.delenv(var, raising=False) + return create_app() + + +@pytest.fixture +def stubbed_app(memory_app): + """App that has a memory-backed RunService injected onto app.state, so + we can exercise /api/runs handler logic without standing up a real PG.""" + repos = MemoryRepos.create() + memory_app.state.run_service = RunService(repos.runs, repos.results) + return memory_app, repos + + +class TestMemoryBackendReturns503: + def test_get_runs(self, memory_app): + with TestClient(memory_app) as client: + r = client.get("/api/runs") + assert r.status_code == 503 + assert "AGENTEVALS_STORAGE_BACKEND=postgres" in r.json()["detail"] + + def test_post_run(self, memory_app): + with TestClient(memory_app) as client: + r = client.post( + "/api/runs", + json={"spec": {"approach": "trace_replay", "target": {"kind": "inline", "inline": {}}}}, + ) + assert r.status_code == 503 + + def test_get_run_by_id(self, memory_app): + with TestClient(memory_app) as client: + r = client.get(f"/api/runs/{uuid4()}") + assert r.status_code == 503 + + def test_get_run_results(self, memory_app): + with TestClient(memory_app) as client: + r = client.get(f"/api/runs/{uuid4()}/results") + assert r.status_code == 503 + + def test_cancel_run(self, memory_app): + with TestClient(memory_app) as client: + r = client.post(f"/api/runs/{uuid4()}/cancel") + assert r.status_code == 503 + + def test_health_endpoint_unaffected(self, memory_app): + with TestClient(memory_app) as client: + r = client.get("/api/health") + assert r.status_code == 200 + + +class TestSubmitRun: + def _payload(self, *, marker="x"): + return { + "spec": { + "approach": "trace_replay", + "target": {"kind": "inline", "inline": {"m": marker}}, + } + } + + def test_submit_returns_202(self, stubbed_app): + app, _ = stubbed_app + with TestClient(app) as client: + r = client.post("/api/runs", json=self._payload()) + assert r.status_code == 202 + body = r.json() + assert body["data"]["status"] == "queued" + assert body["data"]["runId"] + + def test_submit_with_explicit_id(self, stubbed_app): + app, _ = stubbed_app + run_id = 
"11111111-1111-1111-1111-111111111111" + payload = {**self._payload(), "runId": run_id} + with TestClient(app) as client: + r = client.post("/api/runs", json=payload) + assert r.status_code == 202 + assert r.json()["data"]["runId"] == run_id + + def test_idempotent_resubmit_same_spec(self, stubbed_app): + app, _ = stubbed_app + run_id = "22222222-2222-2222-2222-222222222222" + payload = {**self._payload(marker="same"), "runId": run_id} + with TestClient(app) as client: + r1 = client.post("/api/runs", json=payload) + r2 = client.post("/api/runs", json=payload) + assert r1.status_code == 202 + assert r2.status_code == 202 + assert r1.json()["data"]["runId"] == r2.json()["data"]["runId"] + + def test_resubmit_with_different_spec_returns_409(self, stubbed_app): + app, _ = stubbed_app + run_id = "33333333-3333-3333-3333-333333333333" + with TestClient(app) as client: + r1 = client.post("/api/runs", json={**self._payload(marker="A"), "runId": run_id}) + r2 = client.post("/api/runs", json={**self._payload(marker="B"), "runId": run_id}) + assert r1.status_code == 202 + assert r2.status_code == 409 + body = r2.json() + assert "already exists" in body["detail"]["message"] + assert body["detail"]["persisted"]["runId"] == run_id + + def test_invalid_target_kind_rejected(self, stubbed_app): + app, _ = stubbed_app + with TestClient(app) as client: + r = client.post( + "/api/runs", + json={"spec": {"approach": "trace_replay", "target": {"kind": "not-a-kind"}}}, + ) + assert r.status_code == 422 + + +class TestGetAndListRuns: + def test_unknown_run_id_returns_404(self, stubbed_app): + app, _ = stubbed_app + with TestClient(app) as client: + r = client.get(f"/api/runs/{uuid4()}") + assert r.status_code == 404 + + def test_list_empty_then_after_submit(self, stubbed_app): + app, _ = stubbed_app + with TestClient(app) as client: + r1 = client.get("/api/runs") + assert r1.json()["data"] == [] + client.post( + "/api/runs", + json={"spec": {"approach": "trace_replay", "target": {"kind": "inline", "inline": {}}}}, + ) + r2 = client.get("/api/runs") + assert len(r2.json()["data"]) == 1 + + def test_list_status_filter(self, stubbed_app): + app, repos = stubbed_app + with TestClient(app) as client: + client.post( + "/api/runs", + json={"spec": {"approach": "trace_replay", "target": {"kind": "inline", "inline": {}}}}, + ) + r = client.get("/api/runs?status=queued") + assert len(r.json()["data"]) == 1 + r = client.get("/api/runs?status=succeeded") + assert r.json()["data"] == [] + + +class TestCancelRun: + def test_cancel_unknown_run_404(self, stubbed_app): + app, _ = stubbed_app + with TestClient(app) as client: + r = client.post(f"/api/runs/{uuid4()}/cancel") + assert r.status_code == 404 + + def test_cancel_queued_run_marks_cancelled(self, stubbed_app): + app, _ = stubbed_app + with TestClient(app) as client: + sub = client.post( + "/api/runs", + json={"spec": {"approach": "trace_replay", "target": {"kind": "inline", "inline": {}}}}, + ) + run_id = sub.json()["data"]["runId"] + r = client.post(f"/api/runs/{run_id}/cancel") + assert r.status_code == 200 + assert r.json()["data"]["status"] == "cancelled" + + def test_get_run_results_for_unknown_run_404(self, stubbed_app): + app, _ = stubbed_app + with TestClient(app) as client: + r = client.get(f"/api/runs/{uuid4()}/results") + assert r.status_code == 404 diff --git a/tests/run/__init__.py b/tests/run/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/run/test_fetcher.py b/tests/run/test_fetcher.py new file mode 100644 index 0000000..833975f 
--- /dev/null +++ b/tests/run/test_fetcher.py @@ -0,0 +1,79 @@ +"""Trace fetcher dispatch + InlineTraceFetcher behavior.""" + +from __future__ import annotations + +import json + +import pytest + +from agentevals.run.fetcher import HttpTraceFetcher, InlineTraceFetcher, resolve_fetcher +from agentevals.storage.models import TraceTarget + + +class TestResolveFetcher: + def test_inline_returns_inline_fetcher(self): + f = resolve_fetcher(TraceTarget(kind="inline", inline={})) + assert isinstance(f, InlineTraceFetcher) + + def test_http_returns_http_fetcher(self): + f = resolve_fetcher(TraceTarget(kind="http", base_url="https://x", trace_id="abc")) + assert isinstance(f, HttpTraceFetcher) + + def test_uploaded_rejected_with_clear_error(self): + """Uploaded targets cannot be re-executed by the worker; they only + record audit metadata for /api/evaluate calls. resolve_fetcher must + raise rather than silently returning None or a fallback fetcher.""" + with pytest.raises(ValueError, match="cannot be re-executed"): + resolve_fetcher(TraceTarget(kind="uploaded")) + + +class TestInlineTraceFetcher: + async def test_loads_jaeger_format(self, tmp_path): + sample = { + "data": [ + { + "traceID": "1234", + "spans": [ + { + "traceID": "1234", + "spanID": "abcd", + "operationName": "op", + "startTime": 1000, + "duration": 100, + "tags": [], + "logs": [], + "references": [], + "processID": "p1", + } + ], + "processes": {"p1": {"serviceName": "svc"}}, + } + ] + } + fetcher = InlineTraceFetcher() + traces = await fetcher.fetch( + TraceTarget(kind="inline", inline=sample), + context={}, + ) + assert len(traces) >= 1 + + async def test_missing_inline_raises(self): + fetcher = InlineTraceFetcher() + with pytest.raises(ValueError, match="target.inline"): + await fetcher.fetch(TraceTarget(kind="inline"), context={}) + + +class TestHttpTraceFetcher: + """HttpTraceFetcher hits the network; we test the validation path that + runs before any HTTP traffic. 
End-to-end HTTP behavior is covered by
+    the run-flow integration test."""

+    async def test_missing_base_url_raises(self):
+        fetcher = HttpTraceFetcher()
+        with pytest.raises(ValueError, match="base_url"):
+            await fetcher.fetch(TraceTarget(kind="http", trace_id="abc"), context={})
+
+    async def test_missing_trace_id_raises(self):
+        fetcher = HttpTraceFetcher()
+        with pytest.raises(ValueError, match="trace_id"):
+            await fetcher.fetch(TraceTarget(kind="http", base_url="https://x"), context={})
diff --git a/tests/run/test_result_builder.py b/tests/run/test_result_builder.py
new file mode 100644
index 0000000..19b46c2
--- /dev/null
+++ b/tests/run/test_result_builder.py
@@ -0,0 +1,94 @@
+"""Pure-function tests for build_results / summarize_run_result / classify_evaluator."""
+
+from __future__ import annotations
+
+from uuid import UUID, uuid4
+
+from agentevals.config import BuiltinMetricDef, CodeEvaluatorDef, EvalParams
+from agentevals.run.result_builder import build_results, classify_evaluator, summarize_run_result
+from agentevals.runner import MetricResult, RunResult, TraceResult
+from agentevals.storage.models import ResultStatus
+
+
+def _params(custom_evaluators=None) -> EvalParams:
+    return EvalParams(metrics=["m_builtin"], custom_evaluators=custom_evaluators or [])
+
+
+def _trace_result(*metrics) -> TraceResult:
+    return TraceResult(trace_id="trace-1", num_invocations=1, metric_results=list(metrics))
+
+
+def _mr(name="m_builtin", **kw):
+    kw.setdefault("eval_status", "PASSED")
+    return MetricResult(metric_name=name, **kw)
+
+
+class TestClassifyEvaluator:
+    def test_unknown_falls_back_to_builtin(self):
+        assert classify_evaluator("unknown", _params()) == "builtin"
+
+    def test_custom_code_classified_correctly(self):
+        params = _params(custom_evaluators=[CodeEvaluatorDef(name="my_code", path="./e.py")])
+        assert classify_evaluator("my_code", params) == "code"
+
+    def test_builtin_in_metrics_list(self):
+        """Even when explicitly listed in params.metrics, the absence of a
+        matching custom_evaluators entry defaults to 'builtin'. This is
+        intentional: the persisted result row needs a stable type label and
+        custom evaluators are the only ones we can disambiguate by name."""
+        assert classify_evaluator("m_builtin", _params()) == "builtin"
+
+
+class TestBuildResults:
+    def test_one_metric_per_trace_yields_one_result(self):
+        run_id = uuid4()
+        rr = RunResult(trace_results=[_trace_result(_mr())])
+        results = build_results(run_id, _params(), rr)
+        assert len(results) == 1
+        assert results[0].run_id == run_id
+        assert results[0].evaluator_name == "m_builtin"
+
+    def test_multiple_metrics_flatten(self):
+        rr = RunResult(
+            trace_results=[
+                _trace_result(_mr(name="a"), _mr(name="b"), _mr(name="c")),
+                _trace_result(_mr(name="a")),
+            ]
+        )
+        results = build_results(uuid4(), _params(), rr)
+        assert len(results) == 4
+        names = sorted(r.evaluator_name for r in results)
+        assert names == ["a", "a", "b", "c"]
+
+    def test_eval_set_item_id_defaults_to_trace_id(self):
+        """OSS scope: no per-eval-case id extraction. Trace id is the stable
+        identifier for both eval_set_item_id and eval_set_item_name. 
Test + locks this so future changes are deliberate.""" + rr = RunResult(trace_results=[_trace_result(_mr())]) + result = build_results(uuid4(), _params(), rr)[0] + assert result.eval_set_item_id == "trace-1" + assert result.eval_set_item_name == "trace-1" + assert result.trace_id == "trace-1" + + +class TestSummarizeRunResult: + def test_counts_pass_fail_skip_error(self): + rr = RunResult( + trace_results=[ + _trace_result( + _mr(eval_status="PASSED"), + _mr(eval_status="FAILED"), + _mr(eval_status="NOT_EVALUATED"), + _mr(error="boom"), + ) + ] + ) + summary = summarize_run_result(rr) + assert summary["result_counts"] == {"passed": 1, "failed": 1, "skipped": 1, "errored": 1} + assert summary["trace_count"] == 1 + + def test_propagates_errors_and_perf(self): + rr = RunResult(errors=["loader failure"], performance_metrics={"p50": 100}) + summary = summarize_run_result(rr) + assert summary["errors"] == ["loader failure"] + assert summary["performance_metrics"] == {"p50": 100} diff --git a/tests/run/test_service.py b/tests/run/test_service.py new file mode 100644 index 0000000..e5effa8 --- /dev/null +++ b/tests/run/test_service.py @@ -0,0 +1,155 @@ +"""RunService unit tests against memory repos.""" + +from __future__ import annotations + +from uuid import uuid4 + +import pytest + +from agentevals.config import EvalParams +from agentevals.run.service import RunService, RunSubmitConflict +from agentevals.runner import MetricResult, RunResult, TraceResult +from agentevals.storage.models import RunSpec, RunStatus, TraceTarget +from agentevals.storage.repos.memory import MemoryRepos + + +def _spec(*, marker: str = "default") -> RunSpec: + return RunSpec( + approach="trace_replay", + target=TraceTarget(kind="inline", inline={"marker": marker}), + ) + + +@pytest.fixture +def service(): + repos = MemoryRepos.create() + return RunService(repos.runs, repos.results), repos + + +class TestRunServiceSubmit: + async def test_first_submit_creates_run(self, service): + svc, _ = service + run = await svc.submit(run_id=None, spec=_spec()) + assert run.run_id is not None + assert run.status == RunStatus.QUEUED + + async def test_resubmit_with_same_id_and_spec_idempotent(self, service): + svc, _ = service + run = await svc.submit(run_id=None, spec=_spec()) + again = await svc.submit(run_id=run.run_id, spec=_spec()) + assert again.run_id == run.run_id + + async def test_resubmit_with_different_spec_raises_conflict(self, service): + """409 path: re-submitting an existing run_id with a different spec + must NOT overwrite the persisted row, and must surface the persisted + spec to the caller for reconciliation.""" + svc, _ = service + run = await svc.submit(run_id=None, spec=_spec(marker="A")) + with pytest.raises(RunSubmitConflict) as excinfo: + await svc.submit(run_id=run.run_id, spec=_spec(marker="B")) + # The persisted spec attached to the exception should be the original + assert excinfo.value.persisted.spec.target.inline == {"marker": "A"} + + async def test_explicit_run_id_honored(self, service): + svc, _ = service + run_id = uuid4() + run = await svc.submit(run_id=run_id, spec=_spec()) + assert run.run_id == run_id + + +class TestRunServiceQueries: + async def test_get_returns_none_for_unknown(self, service): + svc, _ = service + assert await svc.get(uuid4()) is None + + async def test_list_returns_empty_initially(self, service): + svc, _ = service + assert await svc.list() == [] + + async def test_list_after_submit(self, service): + svc, _ = service + await svc.submit(run_id=None, spec=_spec()) + await 
svc.submit(run_id=None, spec=_spec()) + runs = await svc.list() + assert len(runs) == 2 + + async def test_cancel_unknown_run_returns_false(self, service): + svc, _ = service + assert await svc.cancel(uuid4()) is False + + +class TestRecordCompletedEval: + """Option A: /api/evaluate synchronously persists runs + results.""" + + def _params(self) -> EvalParams: + return EvalParams(metrics=["m1"]) + + def _run_result(self, *, errors=None, metrics=None) -> RunResult: + return RunResult( + trace_results=[ + TraceResult( + trace_id="trace-1", + num_invocations=1, + metric_results=metrics or [MetricResult(metric_name="m1", eval_status="PASSED", score=0.9)], + ) + ], + errors=errors or [], + ) + + async def test_persists_run_as_succeeded_when_no_errors(self, service): + svc, repos = service + run = await svc.record_completed_eval( + spec=_spec(), + params=self._params(), + run_result=self._run_result(), + ) + assert run.status == RunStatus.SUCCEEDED + listed = await repos.runs.list() + assert len(listed) == 1 + assert listed[0].status == RunStatus.SUCCEEDED + + async def test_persists_run_as_failed_when_errors_present(self, service): + svc, repos = service + run = await svc.record_completed_eval( + spec=_spec(), + params=self._params(), + run_result=self._run_result(errors=["loader failed"]), + ) + assert run.status == RunStatus.FAILED + assert run.error and "loader failed" in run.error + listed = await repos.runs.list() + assert listed[0].status == RunStatus.FAILED + + async def test_persists_result_rows(self, service): + svc, repos = service + run = await svc.record_completed_eval( + spec=_spec(), + params=self._params(), + run_result=self._run_result(), + ) + results = await repos.results.list_by_run(run.run_id) + assert len(results) == 1 + assert results[0].evaluator_name == "m1" + + async def test_summary_attached_to_run(self, service): + svc, _ = service + run = await svc.record_completed_eval( + spec=_spec(), + params=self._params(), + run_result=self._run_result( + metrics=[ + MetricResult(metric_name="m1", eval_status="PASSED"), + MetricResult(metric_name="m2", eval_status="FAILED"), + ] + ), + ) + assert run.summary is not None + assert run.summary["result_counts"]["passed"] == 1 + assert run.summary["result_counts"]["failed"] == 1 + + async def test_each_call_creates_distinct_run(self, service): + svc, repos = service + a = await svc.record_completed_eval(spec=_spec(), params=self._params(), run_result=self._run_result()) + b = await svc.record_completed_eval(spec=_spec(), params=self._params(), run_result=self._run_result()) + assert a.run_id != b.run_id + assert len(await repos.runs.list()) == 2 diff --git a/tests/run/test_sinks.py b/tests/run/test_sinks.py new file mode 100644 index 0000000..39ffd38 --- /dev/null +++ b/tests/run/test_sinks.py @@ -0,0 +1,248 @@ +"""Result sink tests. + +Covers stdout / file sinks fully in-process and HttpWebhookSink against a +mock httpx transport so we exercise retry behavior without touching the network. 
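+
+The payload shape asserted below (inferred from these tests, not a formal
+spec): every emit serializes a single JSON object such as
+``{"phase": "partial" | "final" | "error", "run_id": "<uuid>", ...}``.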
+""" + +from __future__ import annotations + +import contextlib +import json +from pathlib import Path +from unittest.mock import patch +from uuid import UUID, uuid4 + +import httpx +import pytest + +from agentevals.run.sinks import ( + FileSink, + HttpWebhookSink, + SinkFanout, + StdoutSink, + build_sinks, +) +from agentevals.storage.models import Result, ResultStatus + + +@contextlib.contextmanager +def _mock_async_client(transport: httpx.MockTransport): + """Patch agentevals.run.sinks.httpx.AsyncClient so the sink's + ``async with httpx.AsyncClient(...)`` call routes through the mock + transport. Patching the symbol on the sinks module beats patching + httpx globally, which can leak into other tests.""" + import agentevals.run.sinks as sinks_module + + real = httpx.AsyncClient + + def _factory(*args, **kwargs): + kwargs["transport"] = transport + return real(*args, **kwargs) + + with patch.object(sinks_module.httpx, "AsyncClient", _factory): + yield + + +def _result(run_id: UUID) -> Result: + return Result( + result_id="rid-1", + run_id=run_id, + eval_set_item_id="item-1", + eval_set_item_name="trace-1", + evaluator_name="m1", + evaluator_type="builtin", + status=ResultStatus.PASSED, + score=0.9, + ) + + +class TestFileSink: + async def test_emits_partial_and_final(self, tmp_path): + path = tmp_path / "out.jsonl" + sink = FileSink(path) + run_id = uuid4() + await sink.emit_partial(run_id, [_result(run_id)], attempt=1) + await sink.emit_final(run_id, {"trace_count": 1}, attempt=1) + await sink.emit_error(run_id, "boom", attempt=1) + lines = path.read_text().strip().splitlines() + assert len(lines) == 3 + partial = json.loads(lines[0]) + assert partial["phase"] == "partial" + final = json.loads(lines[1]) + assert final["phase"] == "final" + assert final["summary"] == {"trace_count": 1} + error = json.loads(lines[2]) + assert error["phase"] == "error" + + async def test_creates_parent_directory(self, tmp_path): + path = tmp_path / "deep" / "nested" / "out.jsonl" + sink = FileSink(path) + await sink.emit_final(uuid4(), {}, attempt=1) + assert path.exists() + + +class TestStdoutSink: + async def test_writes_to_stdout(self, capsys): + sink = StdoutSink() + run_id = uuid4() + await sink.emit_partial(run_id, [_result(run_id)], attempt=1) + await sink.emit_final(run_id, {"k": "v"}, attempt=1) + captured = capsys.readouterr().out + lines = captured.strip().splitlines() + assert len(lines) == 2 + assert json.loads(lines[0])["phase"] == "partial" + assert json.loads(lines[1])["phase"] == "final" + + +class TestHttpWebhookSink: + async def test_post_succeeds_on_2xx(self): + captured: list[httpx.Request] = [] + + def handler(request: httpx.Request) -> httpx.Response: + captured.append(request) + return httpx.Response(200, json={}) + + transport = httpx.MockTransport(handler) + sink = HttpWebhookSink("https://h/x") + run_id = uuid4() + with _mock_async_client(transport): + await sink.emit_final(run_id, {"k": "v"}, attempt=1) + assert len(captured) == 1 + body = json.loads(captured[0].content) + assert body["phase"] == "final" + assert body["run_id"] == str(run_id) + + async def test_4xx_does_not_retry(self): + """4xx means the receiver rejected the payload (auth, validation, + etc); retrying would just hammer them. 
Errors are logged but the + run still completes.""" + calls = 0 + + def handler(request: httpx.Request) -> httpx.Response: + nonlocal calls + calls += 1 + return httpx.Response(401, json={"error": "unauthorized"}) + + transport = httpx.MockTransport(handler) + sink = HttpWebhookSink("https://h/x", max_attempts=5) + with _mock_async_client(transport): + await sink.emit_final(uuid4(), {}, attempt=1) + assert calls == 1 + + async def test_5xx_retries_then_gives_up(self): + calls = 0 + + def handler(request: httpx.Request) -> httpx.Response: + nonlocal calls + calls += 1 + return httpx.Response(503, text="busy") + + transport = httpx.MockTransport(handler) + sink = HttpWebhookSink("https://h/x", max_attempts=3) + with _mock_async_client(transport): + await sink.emit_final(uuid4(), {}, attempt=1) + assert calls == 3 + + async def test_headers_from_env_resolved_at_emit_time(self, monkeypatch): + """Reading env vars at emit time means a host can rotate the auth + token between runs without restarting agentevals.""" + captured: list[dict] = [] + + def handler(request: httpx.Request) -> httpx.Response: + captured.append(dict(request.headers)) + return httpx.Response(200) + + transport = httpx.MockTransport(handler) + sink = HttpWebhookSink( + "https://h/x", + headers={"X-Static": "literal"}, + headers_from_env={"Authorization": "AGENTEVALS_TEST_BEARER"}, + ) + monkeypatch.setenv("AGENTEVALS_TEST_BEARER", "Bearer token-v1") + with _mock_async_client(transport): + await sink.emit_final(uuid4(), {}, attempt=1) + assert captured[0].get("authorization") == "Bearer token-v1" + assert captured[0].get("x-static") == "literal" + + async def test_headers_from_env_skipped_when_unset(self, monkeypatch): + captured: list[dict] = [] + + def handler(request: httpx.Request) -> httpx.Response: + captured.append(dict(request.headers)) + return httpx.Response(200) + + transport = httpx.MockTransport(handler) + sink = HttpWebhookSink( + "https://h/x", + headers_from_env={"Authorization": "AGENTEVALS_TEST_UNSET_VAR"}, + ) + monkeypatch.delenv("AGENTEVALS_TEST_UNSET_VAR", raising=False) + with _mock_async_client(transport): + await sink.emit_final(uuid4(), {}, attempt=1) + assert "authorization" not in captured[0] + + +class TestBuildSinks: + def test_stdout(self): + fanout = build_sinks([{"kind": "stdout"}]) + assert isinstance(fanout, SinkFanout) + + def test_file(self, tmp_path): + fanout = build_sinks([{"kind": "file", "path": str(tmp_path / "x.jsonl")}]) + assert isinstance(fanout, SinkFanout) + + def test_http_webhook_with_auth_env_extraction(self): + fanout = build_sinks( + [ + { + "kind": "http_webhook", + "url": "https://h/x", + "auth": { + "kind": "headers", + "headers": {"Authorization": {"from_env": "MY_TOKEN"}}, + }, + } + ] + ) + assert isinstance(fanout, SinkFanout) + + def test_unknown_kind_skipped_not_raised(self): + """Forward-compat: a host running a newer agentevals replica might + emit a sink kind older replicas don't know. 
Skipping with a warning + beats crashing the entire run.""" + fanout = build_sinks([{"kind": "future_kind"}, {"kind": "stdout"}]) + assert isinstance(fanout, SinkFanout) + + +class TestSinkFanoutErrorIsolation: + """A sink that raises must not abort other sinks or the run itself.""" + + async def test_failures_logged_not_raised(self, capsys): + class BoomSink: + async def emit_partial(self, run_id, results, attempt): + raise RuntimeError("boom") + + async def emit_final(self, run_id, summary, attempt): + raise RuntimeError("boom-final") + + async def emit_error(self, run_id, error, attempt): + raise RuntimeError("boom-error") + + good_writes = [] + + class GoodSink: + async def emit_partial(self, run_id, results, attempt): + good_writes.append("partial") + + async def emit_final(self, run_id, summary, attempt): + good_writes.append("final") + + async def emit_error(self, run_id, error, attempt): + good_writes.append("error") + + fanout = SinkFanout([BoomSink(), GoodSink()]) + run_id = uuid4() + await fanout.emit_partial(run_id, [], attempt=1) + await fanout.emit_final(run_id, {}, attempt=1) + await fanout.emit_error(run_id, "x", attempt=1) + assert good_writes == ["partial", "final", "error"] diff --git a/tests/storage/__init__.py b/tests/storage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/storage/test_config.py b/tests/storage/test_config.py new file mode 100644 index 0000000..242d972 --- /dev/null +++ b/tests/storage/test_config.py @@ -0,0 +1,77 @@ +"""StorageSettings env loading and validation.""" + +from __future__ import annotations + +import pytest + +from agentevals.storage.config import StorageSettings + + +class TestStorageSettings: + def test_defaults(self): + s = StorageSettings() + assert s.backend == "memory" + assert s.database_url is None + assert s.schema_name == "agentevals" + assert s.max_concurrent_runs == 4 + + def test_lease_must_exceed_heartbeat(self): + """Catches operator misconfiguration at boot rather than at first + heartbeat: a lease shorter than the heartbeat interval lets workers + steal each other's runs.""" + with pytest.raises(ValueError, match="lease"): + StorageSettings(lease_s=5, heartbeat_s=5) + with pytest.raises(ValueError, match="lease"): + StorageSettings(lease_s=3, heartbeat_s=5) + + def test_postgres_requires_dsn(self): + with pytest.raises(ValueError, match="AGENTEVALS_DATABASE_URL"): + StorageSettings(backend="postgres", database_url=None) + + def test_postgres_with_dsn_ok(self): + s = StorageSettings(backend="postgres", database_url="postgresql://h/db") + assert s.backend == "postgres" + + def test_unknown_backend_rejected(self): + """Pydantic wraps the field_validator's ValueError in a + ValidationError; use the broader match on the inner message.""" + with pytest.raises(Exception, match="unknown storage backend|sqlite"): + StorageSettings(backend="sqlite") + + def test_from_env_reads_defaults(self, monkeypatch): + for var in [ + "AGENTEVALS_STORAGE_BACKEND", + "AGENTEVALS_DATABASE_URL", + "AGENTEVALS_DATABASE_URL_FILE", + "AGENTEVALS_DATABASE_SCHEMA", + "AGENTEVALS_MAX_CONCURRENT_RUNS", + ]: + monkeypatch.delenv(var, raising=False) + s = StorageSettings.from_env() + assert s.backend == "memory" + + def test_from_env_reads_postgres(self, monkeypatch): + monkeypatch.setenv("AGENTEVALS_STORAGE_BACKEND", "postgres") + monkeypatch.setenv("AGENTEVALS_DATABASE_URL", "postgresql://h/db") + monkeypatch.setenv("AGENTEVALS_DATABASE_SCHEMA", "custom_schema") + monkeypatch.setenv("AGENTEVALS_MAX_CONCURRENT_RUNS", "12") + s = 
StorageSettings.from_env() + assert s.backend == "postgres" + assert s.database_url == "postgresql://h/db" + assert s.schema_name == "custom_schema" + assert s.max_concurrent_runs == 12 + + def test_from_env_url_file_takes_precedence(self, tmp_path, monkeypatch): + dsn_file = tmp_path / "dsn" + dsn_file.write_text("postgresql://from-file/db\n") + monkeypatch.setenv("AGENTEVALS_STORAGE_BACKEND", "postgres") + monkeypatch.setenv("AGENTEVALS_DATABASE_URL", "postgresql://from-env/db") + monkeypatch.setenv("AGENTEVALS_DATABASE_URL_FILE", str(dsn_file)) + s = StorageSettings.from_env() + assert s.database_url == "postgresql://from-file/db" + + def test_from_env_url_file_unreadable_raises(self, tmp_path, monkeypatch): + monkeypatch.setenv("AGENTEVALS_STORAGE_BACKEND", "postgres") + monkeypatch.setenv("AGENTEVALS_DATABASE_URL_FILE", str(tmp_path / "missing")) + with pytest.raises(ValueError, match="unreadable"): + StorageSettings.from_env() diff --git a/tests/storage/test_memory_repos.py b/tests/storage/test_memory_repos.py new file mode 100644 index 0000000..cbb1564 --- /dev/null +++ b/tests/storage/test_memory_repos.py @@ -0,0 +1,226 @@ +"""MemoryRepos behavior tests. + +These exercise the same protocol surface that PostgresRepos implements, so +the test bodies double as a contract that future tests against a live PG can +re-use (parametrize the fixture). +""" + +from __future__ import annotations + +from datetime import datetime, timedelta, timezone +from uuid import UUID, uuid4 + +import pytest + +from agentevals.storage.models import Result, ResultStatus, Run, RunSpec, RunStatus, TraceTarget +from agentevals.storage.repos.memory import MemoryRepos + + +def _make_spec() -> RunSpec: + return RunSpec(approach="trace_replay", target=TraceTarget(kind="inline", inline={"data": []})) + + +def _make_run(run_id: UUID | None = None) -> Run: + return Run(run_id=run_id or uuid4(), status=RunStatus.QUEUED, spec=_make_spec()) + + +@pytest.fixture +def repos(): + return MemoryRepos.create() + + +class TestRunRepository: + async def test_create_and_get(self, repos): + run = _make_run() + await repos.runs.create(run) + fetched = await repos.runs.get(run.run_id) + assert fetched is not None + assert fetched.run_id == run.run_id + assert fetched.status == RunStatus.QUEUED + + async def test_create_idempotent_returns_existing(self, repos): + """Resubmitting the same run_id returns the persisted row, not a new + one; this is what makes POST /api/runs idempotent.""" + run = _make_run() + a = await repos.runs.create(run) + b = await repos.runs.create(run) + assert a.run_id == b.run_id + listed = await repos.runs.list() + assert len(listed) == 1 + + async def test_list_filters_by_status(self, repos): + a = _make_run() + b = _make_run() + await repos.runs.create(a) + await repos.runs.create(b) + await repos.runs.update_status(a.run_id, RunStatus.SUCCEEDED) + succeeded = await repos.runs.list(status=[RunStatus.SUCCEEDED]) + queued = await repos.runs.list(status=[RunStatus.QUEUED]) + assert {r.run_id for r in succeeded} == {a.run_id} + assert {r.run_id for r in queued} == {b.run_id} + + async def test_list_respects_limit(self, repos): + for _ in range(5): + await repos.runs.create(_make_run()) + page = await repos.runs.list(limit=3) + assert len(page) == 3 + + async def test_claim_next_picks_oldest_queued(self, repos): + first = _make_run() + second = _make_run() + await repos.runs.create(first) + await repos.runs.create(second) + claimed = await repos.runs.claim_next(worker_id="w1", lease=timedelta(seconds=30), 
max_attempts=3) + assert claimed is not None + assert claimed.run_id == first.run_id + assert claimed.status == RunStatus.RUNNING + assert claimed.attempt == 1 + + async def test_claim_next_returns_none_when_empty(self, repos): + result = await repos.runs.claim_next(worker_id="w1", lease=timedelta(seconds=30), max_attempts=3) + assert result is None + + async def test_claim_respects_max_attempts(self, repos): + """A run that has exceeded max_attempts is invisible to claim_next so + a poison run cannot starve fresh queued work via repeated re-claims.""" + run = _make_run() + await repos.runs.create(run) + for _ in range(3): + claimed = await repos.runs.claim_next(worker_id="w1", lease=timedelta(seconds=30), max_attempts=3) + if claimed is None: + break + await repos.runs.update_status(claimed.run_id, RunStatus.QUEUED) + # Reset to QUEUED but with attempt=3 already + run_now = await repos.runs.get(run.run_id) + assert run_now is not None + assert run_now.attempt >= 3 + none_claimed = await repos.runs.claim_next(worker_id="w1", lease=timedelta(seconds=30), max_attempts=3) + assert none_claimed is None + + async def test_heartbeat_returns_false_for_unknown_run(self, repos): + alive = await repos.runs.heartbeat(uuid4(), "w1", timedelta(seconds=30)) + assert alive is False + + async def test_heartbeat_returns_false_when_cancel_requested(self, repos): + run = _make_run() + await repos.runs.create(run) + claimed = await repos.runs.claim_next(worker_id="w1", lease=timedelta(seconds=30), max_attempts=3) + assert claimed is not None + await repos.runs.cancel(claimed.run_id) + alive = await repos.runs.heartbeat(claimed.run_id, "w1", timedelta(seconds=30)) + assert alive is False + + async def test_cancel_queued_run_marks_cancelled(self, repos): + run = _make_run() + await repos.runs.create(run) + ok = await repos.runs.cancel(run.run_id) + assert ok is True + fresh = await repos.runs.get(run.run_id) + assert fresh is not None + assert fresh.status == RunStatus.CANCELLED + + async def test_cancel_running_run_sets_flag_only(self, repos): + run = _make_run() + await repos.runs.create(run) + claimed = await repos.runs.claim_next(worker_id="w1", lease=timedelta(seconds=30), max_attempts=3) + assert claimed is not None + ok = await repos.runs.cancel(claimed.run_id) + assert ok is True + fresh = await repos.runs.get(claimed.run_id) + assert fresh is not None + assert fresh.status == RunStatus.RUNNING + assert fresh.cancel_requested is True + + async def test_cancel_terminal_run_returns_false(self, repos): + run = _make_run() + await repos.runs.create(run) + await repos.runs.update_status(run.run_id, RunStatus.SUCCEEDED) + ok = await repos.runs.cancel(run.run_id) + assert ok is False + + async def test_update_status_sets_finished_at_for_terminal(self, repos): + run = _make_run() + await repos.runs.create(run) + await repos.runs.update_status(run.run_id, RunStatus.SUCCEEDED, summary={"k": "v"}) + fresh = await repos.runs.get(run.run_id) + assert fresh is not None + assert fresh.finished_at is not None + assert fresh.summary == {"k": "v"} + + +class TestResultRepository: + def _make_result(self, run_id: UUID, suffix: str = "") -> Result: + return Result( + result_id=f"hash-{run_id}-{suffix}", + run_id=run_id, + eval_set_item_id=f"item-{suffix}", + eval_set_item_name=f"trace-{suffix}", + evaluator_name="m1", + evaluator_type="builtin", + status=ResultStatus.PASSED, + score=0.9, + ) + + async def test_upsert_many_persists_results(self, repos): + run_id = uuid4() + results = [self._make_result(run_id, "a"), 
self._make_result(run_id, "b")] + await repos.results.upsert_many(run_id, results) + listed = await repos.results.list_by_run(run_id) + assert len(listed) == 2 + assert {r.result_id for r in listed} == {results[0].result_id, results[1].result_id} + + async def test_upsert_many_idempotent_on_result_id(self, repos): + """Re-upserting the same result_id replaces the row so retried + webhook posts and worker re-execution stay deduplicated.""" + run_id = uuid4() + first = self._make_result(run_id, "a") + await repos.results.upsert_many(run_id, [first]) + first.score = 0.5 + await repos.results.upsert_many(run_id, [first]) + listed = await repos.results.list_by_run(run_id) + assert len(listed) == 1 + assert listed[0].score == 0.5 + + async def test_empty_upsert_is_noop(self, repos): + run_id = uuid4() + await repos.results.upsert_many(run_id, []) + listed = await repos.results.list_by_run(run_id) + assert listed == [] + + async def test_delete_by_run(self, repos): + run_id = uuid4() + await repos.results.upsert_many(run_id, [self._make_result(run_id, "a")]) + await repos.results.delete_by_run(run_id) + assert await repos.results.list_by_run(run_id) == [] + + +class TestSessionRepository: + """SessionRepository is forward-compat scaffolding in this slice; cover + the basic CRUD surface so regressions surface if the protocol drifts.""" + + async def test_upsert_and_get(self, repos): + from agentevals.streaming.session import TraceSession + + s = TraceSession(session_id="sess-1", trace_id="t-1", eval_set_id=None) + s.trace_ids.add("t-1") + await repos.sessions.upsert(s) + fetched = await repos.sessions.get("sess-1") + assert fetched is not None + assert fetched.session_id == "sess-1" + + async def test_find_by_trace_id(self, repos): + from agentevals.streaming.session import TraceSession + + s = TraceSession(session_id="sess-1", trace_id="t-1", eval_set_id=None) + s.trace_ids.update({"t-1", "t-2"}) + await repos.sessions.upsert(s) + match = await repos.sessions.find_by_trace_id("t-2") + assert match is not None + assert match.session_id == "sess-1" + + async def test_delete(self, repos): + from agentevals.streaming.session import TraceSession + + await repos.sessions.upsert(TraceSession(session_id="sess-1", trace_id="t-1", eval_set_id=None)) + await repos.sessions.delete("sess-1") + assert await repos.sessions.get("sess-1") is None diff --git a/tests/storage/test_migrator.py b/tests/storage/test_migrator.py new file mode 100644 index 0000000..e6f1d90 --- /dev/null +++ b/tests/storage/test_migrator.py @@ -0,0 +1,133 @@ +"""Migration runner tests. + +The pure helpers (file discovery + schema substitution) are tested directly. +Live PG behavior is tested only when AGENTEVALS_TEST_DATABASE_URL is set; +otherwise those tests skip so the suite stays runnable in pure-Python sandboxes. 
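+
+Example invocation (the DSN shape is an assumption; point it at a disposable
+database)::
+
+    AGENTEVALS_TEST_DATABASE_URL=postgresql://user:pass@localhost:5432/scratch \
+        pytest tests/storage/test_migrator.py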
+""" + +from __future__ import annotations + +import os +import re + +import pytest + +from agentevals.storage.postgres.migrator import ( + ADVISORY_LOCK_KEY, + Migration, + Migrator, + _apply_schema, + _discover_migrations, + discover_migrations, +) + + +class TestDiscoverMigrations: + def test_finds_baseline(self): + migrations = _discover_migrations() + assert len(migrations) >= 1 + first = migrations[0] + assert first.version == 1 + assert first.name == "init" + assert first.up_sql.strip() + assert first.down_sql is not None and first.down_sql.strip() + + def test_versions_sorted(self): + migrations = _discover_migrations() + versions = [m.version for m in migrations] + assert versions == sorted(versions) + + def test_public_alias_matches(self): + assert [m.version for m in discover_migrations()] == [m.version for m in _discover_migrations()] + + +class TestApplySchema: + def test_substitutes_placeholder(self): + sql = "CREATE TABLE {schema}.foo (id INT)" + assert _apply_schema(sql, "agentevals") == "CREATE TABLE agentevals.foo (id INT)" + + def test_collapses_doubled_braces(self): + """Doubled braces in SQL literals (e.g. JSONB defaults like '{{}}') + collapse to single braces after the {schema} substitution; this + keeps SQL files readable while letting the placeholder expand.""" + sql = "metadata JSONB NOT NULL DEFAULT '{{}}'" + assert _apply_schema(sql, "agentevals") == "metadata JSONB NOT NULL DEFAULT '{}'" + + def test_supports_custom_schema(self): + sql = "CREATE TABLE {schema}.foo (id INT)" + assert _apply_schema(sql, "myteam") == "CREATE TABLE myteam.foo (id INT)" + + def test_rejects_non_identifier_schema(self): + """Defense against SQL injection via schema name. Schema is taken + from an env var which an operator controls but a future bug could + plumb in untrusted input; the regex stops anything but a SQL identifier.""" + with pytest.raises(ValueError, match="invalid schema"): + _apply_schema("CREATE TABLE {schema}.foo", "drop; DROP TABLE users") + + def test_rejects_quoted_schema(self): + with pytest.raises(ValueError, match="invalid schema"): + _apply_schema("X", '"agentevals"') + + +class TestAdvisoryLockKey: + def test_fits_int8(self): + """pg_try_advisory_lock requires an int8; a key wider than that + wraps silently and would collide unpredictably. Lock key chosen at + random; this test only guards against future drift.""" + assert -(2**63) <= ADVISORY_LOCK_KEY < 2**63 + + def test_stable(self): + """Changing the lock key would let two concurrent migrators race. + Only update the key alongside an explicit migration to a new key.""" + assert ADVISORY_LOCK_KEY == 7259820376655812345 + + +class TestMigrationFilePattern: + def test_filename_format(self): + migrations = _discover_migrations() + for m in migrations: + assert isinstance(m, Migration) + assert re.match(r"^[a-z0-9_]+$", m.name) + assert m.version > 0 + + +@pytest.mark.skipif( + not os.environ.get("AGENTEVALS_TEST_DATABASE_URL"), + reason="requires AGENTEVALS_TEST_DATABASE_URL pointing at a disposable Postgres", +) +class TestMigratorLive: + """Apply / no-op replay / version / force / down — all against a real PG. + + Each test creates and drops its own schema so they can run in any order + against the same database without interfering. 
+ """ + + @pytest.fixture + async def migrator(self): + dsn = os.environ["AGENTEVALS_TEST_DATABASE_URL"] + schema = "agentevals_test_migrator" + m = Migrator(dsn=dsn, schema=schema, lock_timeout_s=10) + yield m + # cleanup + try: + await m.down(steps=1) + except Exception: + pass + + async def test_up_then_replay_is_noop(self, migrator): + applied = await migrator.up() + assert applied == [1] + again = await migrator.up() + assert again == [] + + async def test_version_after_up(self, migrator): + await migrator.up() + status = await migrator.status() + assert status.version == 1 + assert status.dirty is False + + async def test_force_clears_dirty(self, migrator): + await migrator.up() + await migrator.force(version=1) + status = await migrator.status() + assert status.dirty is False diff --git a/tests/storage/test_models.py b/tests/storage/test_models.py new file mode 100644 index 0000000..59629d0 --- /dev/null +++ b/tests/storage/test_models.py @@ -0,0 +1,183 @@ +"""Storage model unit tests: pure functions, validation, MetricResult mapping.""" + +from __future__ import annotations + +import hashlib +from uuid import UUID + +import pytest + +from agentevals.runner import MetricResult +from agentevals.storage.models import ( + Result, + ResultStatus, + Run, + RunSpec, + RunStatus, + TraceTarget, + compute_result_id, +) + + +class TestComputeResultId: + def test_deterministic(self): + a = compute_result_id("00000000-0000-0000-0000-000000000001", "item-x", "metric-y") + b = compute_result_id("00000000-0000-0000-0000-000000000001", "item-x", "metric-y") + assert a == b + + def test_uuid_lowercased(self): + upper = compute_result_id("00000000-0000-0000-0000-00000000ABCD", "item", "m") + lower = compute_result_id("00000000-0000-0000-0000-00000000abcd", "item", "m") + assert upper == lower + + def test_uuid_object_and_string_match(self): + u = UUID("00000000-0000-0000-0000-000000000001") + assert compute_result_id(u, "item", "m") == compute_result_id(str(u), "item", "m") + + def test_pipe_delimiter_byte_spec(self): + """Locks the canonical formula so producer (Python) and any future + consumer agree byte-for-byte. 
Any change here is a breaking change.""" + expected = hashlib.sha256(b"abc|item|m").hexdigest() + assert compute_result_id("abc", "item", "m") == expected + + +class TestTraceTargetValidation: + def test_inline(self): + t = TraceTarget(kind="inline", inline={"data": []}) + assert t.kind == "inline" + + def test_http_with_base_url(self): + t = TraceTarget(kind="http", base_url="https://example/", trace_id="abc") + assert t.base_url == "https://example/" + assert t.trace_id == "abc" + + def test_uploaded_with_audit_metadata(self): + t = TraceTarget(kind="uploaded", trace_count=2, trace_files=["a.json", "b.json"]) + assert t.kind == "uploaded" + assert t.trace_count == 2 + assert t.trace_files == ["a.json", "b.json"] + + def test_unknown_kind_rejected(self): + from pydantic import ValidationError + + with pytest.raises(ValidationError): + TraceTarget(kind="not-a-kind") + + +class TestRunSpec: + def test_minimal_inline_spec(self): + spec = RunSpec(approach="trace_replay", target=TraceTarget(kind="inline", inline={})) + assert spec.approach == "trace_replay" + assert spec.target.kind == "inline" + assert spec.eval_set is None + assert spec.eval_config == {} + assert spec.sinks == [] + assert spec.context == {} + + def test_extra_fields_allowed_for_forward_compat(self): + """RunSpec uses extra='allow' so a host can include forward-compatible + metadata without breaking older agentevals replicas.""" + spec = RunSpec.model_validate( + { + "approach": "trace_replay", + "target": {"kind": "inline", "inline": {}}, + "futureField": "unknown", + } + ) + assert spec.target.kind == "inline" + + +class TestResultFromMetricResult: + """Locks the renaming + status-mapping behavior between the in-pipeline + MetricResult shape and the persisted Result shape.""" + + def _mr(self, **overrides): + defaults = dict( + metric_name="tool_trajectory_avg_score", + score=0.8, + eval_status="PASSED", + per_invocation_scores=[1.0, 0.6], + error=None, + details={"foo": "bar"}, + duration_ms=42.5, + ) + defaults.update(overrides) + return MetricResult(**defaults) + + def _build(self, mr): + return Result.from_metric_result( + run_id=UUID("00000000-0000-0000-0000-000000000001"), + eval_set_item_id="item-1", + eval_set_item_name="trace-abc", + trace_id="trace-abc", + evaluator_type="builtin", + metric_result=mr, + ) + + def test_passed_maps_to_passed(self): + r = self._build(self._mr(eval_status="PASSED")) + assert r.status == ResultStatus.PASSED + assert r.score == 0.8 + assert r.evaluator_name == "tool_trajectory_avg_score" + assert r.evaluator_type == "builtin" + assert r.eval_set_item_id == "item-1" + assert r.trace_id == "trace-abc" + + def test_failed_maps_to_failed(self): + r = self._build(self._mr(eval_status="FAILED")) + assert r.status == ResultStatus.FAILED + + def test_not_evaluated_maps_to_skipped(self): + r = self._build(self._mr(eval_status="NOT_EVALUATED", score=None, per_invocation_scores=[])) + assert r.status == ResultStatus.SKIPPED + + def test_unknown_status_maps_to_skipped(self): + """Defensive: ADK sometimes emits non-standard status strings; + anything unknown should land as skipped, not crash.""" + r = self._build(self._mr(eval_status="MAYBE_PASSED")) + assert r.status == ResultStatus.SKIPPED + + def test_error_dominates_status(self): + """Even if eval_status says PASSED, a non-empty error means + the row should land as 'errored' so downstream consumers can + filter cleanly without special-casing the error field.""" + r = self._build(self._mr(eval_status="PASSED", error="boom")) + assert r.status == 
ResultStatus.ERRORED + assert r.error_text == "boom" + + def test_duration_ms_renamed_to_latency_ms(self): + r = self._build(self._mr(duration_ms=42.7)) + assert r.latency_ms == 42 # int truncation matches the schema column type + + def test_latency_ms_none_when_duration_missing(self): + r = self._build(self._mr(duration_ms=None)) + assert r.latency_ms is None + + def test_per_invocation_scores_preserved(self): + r = self._build(self._mr(per_invocation_scores=[0.0, 0.5, 1.0])) + assert r.per_invocation_scores == [0.0, 0.5, 1.0] + + def test_details_default_to_empty_dict(self): + r = self._build(self._mr(details=None)) + assert r.details == {} + + def test_result_id_matches_canonical_formula(self): + r = self._build(self._mr()) + expected = compute_result_id( + UUID("00000000-0000-0000-0000-000000000001"), + "item-1", + "tool_trajectory_avg_score", + ) + assert r.result_id == expected + + +class TestRun: + def test_default_status_and_attempt(self): + run = Run( + run_id=UUID("00000000-0000-0000-0000-000000000001"), + status=RunStatus.QUEUED, + spec=RunSpec(approach="trace_replay", target=TraceTarget(kind="inline", inline={})), + ) + assert run.attempt == 0 + assert run.cancel_requested is False + assert run.error is None diff --git a/uv.lock b/uv.lock index 8a573ec..2d42db3 100644 --- a/uv.lock +++ b/uv.lock @@ -53,6 +53,9 @@ live = [ openai = [ { name = "openai" }, ] +postgres = [ + { name = "asyncpg" }, +] streaming = [ { name = "opentelemetry-sdk" }, { name = "websockets" }, @@ -80,6 +83,7 @@ e2e = [ [package.metadata] requires-dist = [ + { name = "asyncpg", marker = "extra == 'postgres'", specifier = ">=0.30.0" }, { name = "click", specifier = ">=8.0" }, { name = "fastapi", specifier = ">=0.115.0" }, { name = "google-adk", extras = ["eval"], specifier = ">=1.30.0" }, @@ -96,7 +100,7 @@ requires-dist = [ { name = "uvicorn", extras = ["standard"], specifier = ">=0.32.0" }, { name = "websockets", marker = "extra == 'streaming'", specifier = ">=12.0" }, ] -provides-extras = ["live", "streaming", "openai"] +provides-extras = ["live", "streaming", "openai", "postgres"] [package.metadata.requires-dev] dev = [ @@ -307,6 +311,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/0e/27be9fdef66e72d64c0cdc3cc2823101b80585f8119b5c112c2e8f5f7dab/anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c", size = 113592, upload-time = "2026-01-06T11:45:19.497Z" }, ] +[[package]] +name = "asyncpg" +version = "0.31.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fe/cc/d18065ce2380d80b1bcce927c24a2642efd38918e33fd724bc4bca904877/asyncpg-0.31.0.tar.gz", hash = "sha256:c989386c83940bfbd787180f2b1519415e2d3d6277a70d9d0f0145ac73500735", size = 993667, upload-time = "2025-11-24T23:27:00.812Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/17/cc02bc49bc350623d050fa139e34ea512cd6e020562f2a7312a7bcae4bc9/asyncpg-0.31.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:eee690960e8ab85063ba93af2ce128c0f52fd655fdff9fdb1a28df01329f031d", size = 643159, upload-time = "2025-11-24T23:25:36.443Z" }, + { url = "https://files.pythonhosted.org/packages/a4/62/4ded7d400a7b651adf06f49ea8f73100cca07c6df012119594d1e3447aa6/asyncpg-0.31.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2657204552b75f8288de08ca60faf4a99a65deef3a71d1467454123205a88fab", size = 638157, upload-time = "2025-11-24T23:25:37.89Z" }, + { url = 
"https://files.pythonhosted.org/packages/d6/5b/4179538a9a72166a0bf60ad783b1ef16efb7960e4d7b9afe9f77a5551680/asyncpg-0.31.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a429e842a3a4b4ea240ea52d7fe3f82d5149853249306f7ff166cb9948faa46c", size = 2918051, upload-time = "2025-11-24T23:25:39.461Z" }, + { url = "https://files.pythonhosted.org/packages/e6/35/c27719ae0536c5b6e61e4701391ffe435ef59539e9360959240d6e47c8c8/asyncpg-0.31.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0807be46c32c963ae40d329b3a686356e417f674c976c07fa49f1b30303f109", size = 2972640, upload-time = "2025-11-24T23:25:41.512Z" }, + { url = "https://files.pythonhosted.org/packages/43/f4/01ebb9207f29e645a64699b9ce0eefeff8e7a33494e1d29bb53736f7766b/asyncpg-0.31.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e5d5098f63beeae93512ee513d4c0c53dc12e9aa2b7a1af5a81cddf93fe4e4da", size = 2851050, upload-time = "2025-11-24T23:25:43.153Z" }, + { url = "https://files.pythonhosted.org/packages/3e/f4/03ff1426acc87be0f4e8d40fa2bff5c3952bef0080062af9efc2212e3be8/asyncpg-0.31.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37fc6c00a814e18eef51833545d1891cac9aa69140598bb076b4cd29b3e010b9", size = 2962574, upload-time = "2025-11-24T23:25:44.942Z" }, + { url = "https://files.pythonhosted.org/packages/c7/39/cc788dfca3d4060f9d93e67be396ceec458dfc429e26139059e58c2c244d/asyncpg-0.31.0-cp311-cp311-win32.whl", hash = "sha256:5a4af56edf82a701aece93190cc4e094d2df7d33f6e915c222fb09efbb5afc24", size = 521076, upload-time = "2025-11-24T23:25:46.486Z" }, + { url = "https://files.pythonhosted.org/packages/28/fc/735af5384c029eb7f1ca60ccb8fa95521dbdaeef788edf4cecfc604c3cab/asyncpg-0.31.0-cp311-cp311-win_amd64.whl", hash = "sha256:480c4befbdf079c14c9ca43c8c5e1fe8b6296c96f1f927158d4f1e750aacc047", size = 584980, upload-time = "2025-11-24T23:25:47.938Z" }, + { url = "https://files.pythonhosted.org/packages/2a/a6/59d0a146e61d20e18db7396583242e32e0f120693b67a8de43f1557033e2/asyncpg-0.31.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b44c31e1efc1c15188ef183f287c728e2046abb1d26af4d20858215d50d91fad", size = 662042, upload-time = "2025-11-24T23:25:49.578Z" }, + { url = "https://files.pythonhosted.org/packages/36/01/ffaa189dcb63a2471720615e60185c3f6327716fdc0fc04334436fbb7c65/asyncpg-0.31.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0c89ccf741c067614c9b5fc7f1fc6f3b61ab05ae4aaa966e6fd6b93097c7d20d", size = 638504, upload-time = "2025-11-24T23:25:51.501Z" }, + { url = "https://files.pythonhosted.org/packages/9f/62/3f699ba45d8bd24c5d65392190d19656d74ff0185f42e19d0bbd973bb371/asyncpg-0.31.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:12b3b2e39dc5470abd5e98c8d3373e4b1d1234d9fbdedf538798b2c13c64460a", size = 3426241, upload-time = "2025-11-24T23:25:53.278Z" }, + { url = "https://files.pythonhosted.org/packages/8c/d1/a867c2150f9c6e7af6462637f613ba67f78a314b00db220cd26ff559d532/asyncpg-0.31.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:aad7a33913fb8bcb5454313377cc330fbb19a0cd5faa7272407d8a0c4257b671", size = 3520321, upload-time = "2025-11-24T23:25:54.982Z" }, + { url = "https://files.pythonhosted.org/packages/7a/1a/cce4c3f246805ecd285a3591222a2611141f1669d002163abef999b60f98/asyncpg-0.31.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3df118d94f46d85b2e434fd62c84cb66d5834d5a890725fe625f498e72e4d5ec", size = 3316685, upload-time = "2025-11-24T23:25:57.43Z" }, + { url = 
"https://files.pythonhosted.org/packages/40/ae/0fc961179e78cc579e138fad6eb580448ecae64908f95b8cb8ee2f241f67/asyncpg-0.31.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bd5b6efff3c17c3202d4b37189969acf8927438a238c6257f66be3c426beba20", size = 3471858, upload-time = "2025-11-24T23:25:59.636Z" }, + { url = "https://files.pythonhosted.org/packages/52/b2/b20e09670be031afa4cbfabd645caece7f85ec62d69c312239de568e058e/asyncpg-0.31.0-cp312-cp312-win32.whl", hash = "sha256:027eaa61361ec735926566f995d959ade4796f6a49d3bde17e5134b9964f9ba8", size = 527852, upload-time = "2025-11-24T23:26:01.084Z" }, + { url = "https://files.pythonhosted.org/packages/b5/f0/f2ed1de154e15b107dc692262395b3c17fc34eafe2a78fc2115931561730/asyncpg-0.31.0-cp312-cp312-win_amd64.whl", hash = "sha256:72d6bdcbc93d608a1158f17932de2321f68b1a967a13e014998db87a72ed3186", size = 597175, upload-time = "2025-11-24T23:26:02.564Z" }, + { url = "https://files.pythonhosted.org/packages/95/11/97b5c2af72a5d0b9bc3fa30cd4b9ce22284a9a943a150fdc768763caf035/asyncpg-0.31.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c204fab1b91e08b0f47e90a75d1b3c62174dab21f670ad6c5d0f243a228f015b", size = 661111, upload-time = "2025-11-24T23:26:04.467Z" }, + { url = "https://files.pythonhosted.org/packages/1b/71/157d611c791a5e2d0423f09f027bd499935f0906e0c2a416ce712ba51ef3/asyncpg-0.31.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:54a64f91839ba59008eccf7aad2e93d6e3de688d796f35803235ea1c4898ae1e", size = 636928, upload-time = "2025-11-24T23:26:05.944Z" }, + { url = "https://files.pythonhosted.org/packages/2e/fc/9e3486fb2bbe69d4a867c0b76d68542650a7ff1574ca40e84c3111bb0c6e/asyncpg-0.31.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c0e0822b1038dc7253b337b0f3f676cadc4ac31b126c5d42691c39691962e403", size = 3424067, upload-time = "2025-11-24T23:26:07.957Z" }, + { url = "https://files.pythonhosted.org/packages/12/c6/8c9d076f73f07f995013c791e018a1cd5f31823c2a3187fc8581706aa00f/asyncpg-0.31.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bef056aa502ee34204c161c72ca1f3c274917596877f825968368b2c33f585f4", size = 3518156, upload-time = "2025-11-24T23:26:09.591Z" }, + { url = "https://files.pythonhosted.org/packages/ae/3b/60683a0baf50fbc546499cfb53132cb6835b92b529a05f6a81471ab60d0c/asyncpg-0.31.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0bfbcc5b7ffcd9b75ab1558f00db2ae07db9c80637ad1b2469c43df79d7a5ae2", size = 3319636, upload-time = "2025-11-24T23:26:11.168Z" }, + { url = "https://files.pythonhosted.org/packages/50/dc/8487df0f69bd398a61e1792b3cba0e47477f214eff085ba0efa7eac9ce87/asyncpg-0.31.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:22bc525ebbdc24d1261ecbf6f504998244d4e3be1721784b5f64664d61fbe602", size = 3472079, upload-time = "2025-11-24T23:26:13.164Z" }, + { url = "https://files.pythonhosted.org/packages/13/a1/c5bbeeb8531c05c89135cb8b28575ac2fac618bcb60119ee9696c3faf71c/asyncpg-0.31.0-cp313-cp313-win32.whl", hash = "sha256:f890de5e1e4f7e14023619399a471ce4b71f5418cd67a51853b9910fdfa73696", size = 527606, upload-time = "2025-11-24T23:26:14.78Z" }, + { url = "https://files.pythonhosted.org/packages/91/66/b25ccb84a246b470eb943b0107c07edcae51804912b824054b3413995a10/asyncpg-0.31.0-cp313-cp313-win_amd64.whl", hash = "sha256:dc5f2fa9916f292e5c5c8b2ac2813763bcd7f58e130055b4ad8a0531314201ab", size = 596569, upload-time = "2025-11-24T23:26:16.189Z" }, + { url = 
"https://files.pythonhosted.org/packages/3c/36/e9450d62e84a13aea6580c83a47a437f26c7ca6fa0f0fd40b6670793ea30/asyncpg-0.31.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:f6b56b91bb0ffc328c4e3ed113136cddd9deefdf5f79ab448598b9772831df44", size = 660867, upload-time = "2025-11-24T23:26:17.631Z" }, + { url = "https://files.pythonhosted.org/packages/82/4b/1d0a2b33b3102d210439338e1beea616a6122267c0df459ff0265cd5807a/asyncpg-0.31.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:334dec28cf20d7f5bb9e45b39546ddf247f8042a690bff9b9573d00086e69cb5", size = 638349, upload-time = "2025-11-24T23:26:19.689Z" }, + { url = "https://files.pythonhosted.org/packages/41/aa/e7f7ac9a7974f08eff9183e392b2d62516f90412686532d27e196c0f0eeb/asyncpg-0.31.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:98cc158c53f46de7bb677fd20c417e264fc02b36d901cc2a43bd6cb0dc6dbfd2", size = 3410428, upload-time = "2025-11-24T23:26:21.275Z" }, + { url = "https://files.pythonhosted.org/packages/6f/de/bf1b60de3dede5c2731e6788617a512bc0ebd9693eac297ee74086f101d7/asyncpg-0.31.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9322b563e2661a52e3cdbc93eed3be7748b289f792e0011cb2720d278b366ce2", size = 3471678, upload-time = "2025-11-24T23:26:23.627Z" }, + { url = "https://files.pythonhosted.org/packages/46/78/fc3ade003e22d8bd53aaf8f75f4be48f0b460fa73738f0391b9c856a9147/asyncpg-0.31.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:19857a358fc811d82227449b7ca40afb46e75b33eb8897240c3839dd8b744218", size = 3313505, upload-time = "2025-11-24T23:26:25.235Z" }, + { url = "https://files.pythonhosted.org/packages/bf/e9/73eb8a6789e927816f4705291be21f2225687bfa97321e40cd23055e903a/asyncpg-0.31.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ba5f8886e850882ff2c2ace5732300e99193823e8107e2c53ef01c1ebfa1e85d", size = 3434744, upload-time = "2025-11-24T23:26:26.944Z" }, + { url = "https://files.pythonhosted.org/packages/08/4b/f10b880534413c65c5b5862f79b8e81553a8f364e5238832ad4c0af71b7f/asyncpg-0.31.0-cp314-cp314-win32.whl", hash = "sha256:cea3a0b2a14f95834cee29432e4ddc399b95700eb1d51bbc5bfee8f31fa07b2b", size = 532251, upload-time = "2025-11-24T23:26:28.404Z" }, + { url = "https://files.pythonhosted.org/packages/d3/2d/7aa40750b7a19efa5d66e67fc06008ca0f27ba1bd082e457ad82f59aba49/asyncpg-0.31.0-cp314-cp314-win_amd64.whl", hash = "sha256:04d19392716af6b029411a0264d92093b6e5e8285ae97a39957b9a9c14ea72be", size = 604901, upload-time = "2025-11-24T23:26:30.34Z" }, + { url = "https://files.pythonhosted.org/packages/ce/fe/b9dfe349b83b9dee28cc42360d2c86b2cdce4cb551a2c2d27e156bcac84d/asyncpg-0.31.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:bdb957706da132e982cc6856bb2f7b740603472b54c3ebc77fe60ea3e57e1bd2", size = 702280, upload-time = "2025-11-24T23:26:32Z" }, + { url = "https://files.pythonhosted.org/packages/6a/81/e6be6e37e560bd91e6c23ea8a6138a04fd057b08cf63d3c5055c98e81c1d/asyncpg-0.31.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6d11b198111a72f47154fa03b85799f9be63701e068b43f84ac25da0bda9cb31", size = 682931, upload-time = "2025-11-24T23:26:33.572Z" }, + { url = "https://files.pythonhosted.org/packages/a6/45/6009040da85a1648dd5bc75b3b0a062081c483e75a1a29041ae63a0bf0dc/asyncpg-0.31.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:18c83b03bc0d1b23e6230f5bf8d4f217dc9bc08644ce0502a9d91dc9e634a9c7", size = 3581608, upload-time = "2025-11-24T23:26:35.638Z" }, + { url = 
"https://files.pythonhosted.org/packages/7e/06/2e3d4d7608b0b2b3adbee0d0bd6a2d29ca0fc4d8a78f8277df04e2d1fd7b/asyncpg-0.31.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e009abc333464ff18b8f6fd146addffd9aaf63e79aa3bb40ab7a4c332d0c5e9e", size = 3498738, upload-time = "2025-11-24T23:26:37.275Z" }, + { url = "https://files.pythonhosted.org/packages/7d/aa/7d75ede780033141c51d83577ea23236ba7d3a23593929b32b49db8ed36e/asyncpg-0.31.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3b1fbcb0e396a5ca435a8826a87e5c2c2cc0c8c68eb6fadf82168056b0e53a8c", size = 3401026, upload-time = "2025-11-24T23:26:39.423Z" }, + { url = "https://files.pythonhosted.org/packages/ba/7a/15e37d45e7f7c94facc1e9148c0e455e8f33c08f0b8a0b1deb2c5171771b/asyncpg-0.31.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8df714dba348efcc162d2adf02d213e5fab1bd9f557e1305633e851a61814a7a", size = 3429426, upload-time = "2025-11-24T23:26:41.032Z" }, + { url = "https://files.pythonhosted.org/packages/13/d5/71437c5f6ae5f307828710efbe62163974e71237d5d46ebd2869ea052d10/asyncpg-0.31.0-cp314-cp314t-win32.whl", hash = "sha256:1b41f1afb1033f2b44f3234993b15096ddc9cd71b21a42dbd87fc6a57b43d65d", size = 614495, upload-time = "2025-11-24T23:26:42.659Z" }, + { url = "https://files.pythonhosted.org/packages/3c/d7/8fb3044eaef08a310acfe23dae9a8e2e07d305edc29a53497e52bc76eca7/asyncpg-0.31.0-cp314-cp314t-win_amd64.whl", hash = "sha256:bd4107bb7cdd0e9e65fae66a62afd3a249663b844fa34d479f6d5b3bef9c04c3", size = 706062, upload-time = "2025-11-24T23:26:44.086Z" }, +] + [[package]] name = "attrs" version = "25.4.0"