diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index e7a73ed..4c76143 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -23,10 +23,33 @@
 make dev-frontend   # start Vite dev server (port 5173) with HMR
 make dev-bundle     # build UI, serve full bundled experience at port 8001 via uv run
 ```
-Standard development uses `dev-backend` + `dev-frontend` in separate terminals. The Vite dev server proxies nothing — the frontend calls the backend at `http://localhost:8001` directly via CORS.
+Standard development uses `dev-backend` + `dev-frontend` in separate terminals. The Vite dev server proxies nothing; the frontend calls the backend at `http://localhost:8001` directly via CORS.
 
 `dev-bundle` is useful for testing the bundled UI experience without building a wheel. It copies `ui/dist` into the source tree temporarily and cleans up when the server exits.
 
+### Postgres backend (optional, for `/api/runs`)
+
+The default in-memory backend keeps `make dev-backend` zero-config. To exercise the async run pipeline locally, bring up a Postgres instance alongside the app:
+
+```bash
+make pg-up           # start postgres:17-alpine in a docker container (port 5432, ephemeral via --rm)
+make migrate         # apply the agentevals schema
+make dev-backend-pg  # pg-up + migrate + serve --dev with backend=postgres wired up
+make pg-down         # stop the container; data is discarded with --rm
+```
+
+Override the defaults via `PG_PORT=5433 make pg-up` etc. The `migrate` target is idempotent (a second invocation is a no-op).
+
+Once running, submit a run with:
+
+```bash
+curl -X POST http://localhost:8001/api/runs \
+  -H 'content-type: application/json' \
+  -d '{"spec": {"approach": "trace_replay", "target": {"kind": "inline", "inline": {...}}, "evalConfig": {"metrics": ["tool_trajectory_avg_score"]}}}'
+```
+
+Then poll `GET /api/runs/{runId}` and `GET /api/runs/{runId}/results`. Without `AGENTEVALS_STORAGE_BACKEND=postgres`, the `/api/runs` endpoints return 503 with a hint pointing at the env var.
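+
+The same flow from Python, as a rough polling sketch (assumes `httpx` is installed; the camelCase keys and lowercase status strings mirror how the new runs router serializes its models, but treat the exact field names here as illustrative):
+
+```python
+import time
+
+import httpx
+
+BASE = "http://localhost:8001/api"
+
+spec = {
+    "approach": "trace_replay",
+    "target": {"kind": "inline", "inline": {}},  # put a real inline trace payload here
+    "evalConfig": {"metrics": ["tool_trajectory_avg_score"]},
+}
+
+resp = httpx.post(f"{BASE}/runs", json={"spec": spec})
+resp.raise_for_status()
+run = resp.json()["data"]
+
+# poll until the run reaches a terminal state
+while run["status"] in ("queued", "running"):
+    time.sleep(1)
+    run = httpx.get(f"{BASE}/runs/{run['runId']}").json()["data"]
+
+print(run["status"])
+print(httpx.get(f"{BASE}/runs/{run['runId']}/results").json()["data"])
+```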
+ ### Building ```bash diff --git a/Dockerfile b/Dockerfile index f1eb0d8..d43d63c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,7 @@ COPY src ./src COPY --from=ui /build/ui/dist ./src/agentevals/_static -RUN uv sync --frozen --no-dev --extra live \ +RUN uv sync --frozen --no-dev --extra live --extra postgres \ && groupadd --gid 1000 app \ && useradd --uid 1000 --gid app --home-dir /app --no-log-init app \ && chown -R app:app /app diff --git a/Makefile b/Makefile index cee2922..32147f9 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,14 @@ HELM_CHART_DIR ?= charts/agentevals HELM_CHART_OCI_URL ?= $(HELM_REPO)/helm HELM_CHART_VERSION ?= $(VERSION) -.PHONY: build build-bundle build-docker build-ui release clean dev-backend dev-frontend dev-bundle test test-unit test-integration test-e2e helm-lint helm-template helm-test helm-cleanup helm-package helm-publish +.PHONY: build build-bundle build-docker build-ui release clean dev-backend dev-backend-pg dev-frontend dev-bundle pg-up pg-down migrate test test-unit test-integration test-e2e helm-lint helm-template helm-test helm-cleanup helm-package helm-publish + +PG_CONTAINER ?= agentevals-pg +PG_PORT ?= 5432 +PG_USER ?= agentevals +PG_PASSWORD ?= agentevals +PG_DATABASE ?= agentevals +PG_DSN ?= postgresql://$(PG_USER):$(PG_PASSWORD)@localhost:$(PG_PORT)/$(PG_DATABASE) build: uv build @@ -53,6 +60,30 @@ release: clean build-ui dev-backend: uv run agentevals serve --dev +pg-up: + @if [ -z "$$(docker ps -q -f name=^/$(PG_CONTAINER)$$)" ]; then \ + docker run -d --rm --name $(PG_CONTAINER) \ + -e POSTGRES_USER=$(PG_USER) \ + -e POSTGRES_PASSWORD=$(PG_PASSWORD) \ + -e POSTGRES_DB=$(PG_DATABASE) \ + -p $(PG_PORT):5432 postgres:17-alpine; \ + else \ + echo "container $(PG_CONTAINER) already running"; \ + fi + @until docker exec $(PG_CONTAINER) pg_isready -U $(PG_USER) >/dev/null 2>&1; do sleep 1; done + @echo "Postgres ready at $(PG_DSN)" + +pg-down: + -docker stop $(PG_CONTAINER) + +migrate: + AGENTEVALS_DATABASE_URL=$(PG_DSN) uv run agentevals migrate up + +dev-backend-pg: pg-up migrate + AGENTEVALS_STORAGE_BACKEND=postgres \ + AGENTEVALS_DATABASE_URL=$(PG_DSN) \ + uv run agentevals serve --dev + dev-frontend: cd ui && npm run dev diff --git a/README.md b/README.md index c25b278..52b2e91 100644 --- a/README.md +++ b/README.md @@ -286,6 +286,24 @@ The source for the chart lives in [`charts/agentevals/`](charts/agentevals/) if See the [Kubernetes example](examples/kubernetes/README.md) for an end-to-end walkthrough deploying agentevals alongside kagent and an OTel Collector on Kubernetes. +#### Postgres backend (`/api/runs`) + +By default the chart deploys agentevals with an in-memory backend; runs and results are not persisted. To enable the async `POST /api/runs` pipeline with durable Postgres-backed state: + +```bash +# Bundled Postgres (dev / evaluation only): +helm install agentevals oci://ghcr.io/agentevals-dev/agentevals/helm/agentevals \ + --set storage.backend=postgres \ + --set database.postgres.bundled.enabled=true + +# Or supply an external Postgres DSN: +helm install agentevals oci://ghcr.io/agentevals-dev/agentevals/helm/agentevals \ + --set storage.backend=postgres \ + --set database.postgres.url='postgresql://user:pass@host:5432/dbname' +``` + +When `storage.backend=postgres` the app applies any pending schema migrations on startup (advisory-lock protected, safe across replicas) and starts an in-process worker that processes the run queue. 
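+
+Conceptually, that startup guard is just a Postgres advisory lock held around the migration pass. A minimal sketch of the idea (assumes `asyncpg`; the actual lock key and bookkeeping are internal to the app's migrator):
+
+```python
+import asyncpg
+
+LOCK_KEY = 0x41452121  # illustrative constant, not the real key
+
+async def migrate_on_startup(dsn: str) -> None:
+    conn = await asyncpg.connect(dsn)
+    try:
+        # Only one replica proceeds past this line; the rest block until the
+        # winner finishes applying pending migrations and releases the lock.
+        await conn.execute("SELECT pg_advisory_lock($1)", LOCK_KEY)
+        # ... compare schema_migrations against the bundled *.up.sql files
+        # and apply anything pending ...
+    finally:
+        await conn.execute("SELECT pg_advisory_unlock($1)", LOCK_KEY)
+        await conn.close()
+```
+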
Without `storage.backend=postgres` the `/api/runs` endpoints return 503 with a hint pointing at the env var. + ## MCP Server Exposes evaluation tools to MCP clients. A `.mcp.json` at the project root lets Claude Code pick it up automatically. diff --git a/charts/agentevals/templates/_helpers.tpl b/charts/agentevals/templates/_helpers.tpl index 13f3cc6..6672e29 100644 --- a/charts/agentevals/templates/_helpers.tpl +++ b/charts/agentevals/templates/_helpers.tpl @@ -48,6 +48,17 @@ app.kubernetes.io/name: {{ include "agentevals.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} +{{- /* +Selector labels scoped to the main app Pod and its Service. Carries the +``app.kubernetes.io/component: agentevals`` discriminator so the agentevals +Service does not also match the bundled Postgres Pod (which carries +``app.kubernetes.io/component: database`` instead). +*/ -}} +{{- define "agentevals.app.selectorLabels" -}} +{{ include "agentevals.selectorLabels" . }} +app.kubernetes.io/component: agentevals +{{- end }} + {{- define "agentevals.serviceAccountName" -}} {{- if .Values.serviceAccount.create }} {{- default (include "agentevals.fullname" .) .Values.serviceAccount.name }} @@ -55,3 +66,25 @@ app.kubernetes.io/instance: {{ .Release.Name }} {{- default "default" .Values.serviceAccount.name }} {{- end }} {{- end }} + +{{/* +Service name for the bundled Postgres instance. +*/}} +{{- define "agentevals.postgresqlServiceName" -}} +{{- printf "%s-postgresql" (include "agentevals.fullname" .) -}} +{{- end -}} + +{{/* +Bundled Postgres image reference (registry/repository/name:tag). +*/}} +{{- define "agentevals.postgresql.image" -}} +{{- $pg := .Values.database.postgres.bundled -}} +{{- printf "%s/%s/%s:%s" $pg.image.registry $pg.image.repository $pg.image.name $pg.image.tag -}} +{{- end -}} + +{{/* +Secret name holding POSTGRES_PASSWORD for the bundled Postgres instance. +*/}} +{{- define "agentevals.passwordSecretName" -}} +{{- printf "%s-postgresql" (include "agentevals.fullname" .) -}} +{{- end -}} diff --git a/charts/agentevals/templates/deployment.yaml b/charts/agentevals/templates/deployment.yaml index 3a56b25..e8852dc 100644 --- a/charts/agentevals/templates/deployment.yaml +++ b/charts/agentevals/templates/deployment.yaml @@ -9,7 +9,7 @@ spec: replicas: {{ .Values.replicaCount }} selector: matchLabels: - {{- include "agentevals.selectorLabels" . | nindent 6 }} + {{- include "agentevals.app.selectorLabels" . | nindent 6 }} template: metadata: {{- with .Values.podAnnotations }} @@ -17,7 +17,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} labels: - {{- include "agentevals.selectorLabels" . | nindent 8 }} + {{- include "agentevals.app.selectorLabels" . | nindent 8 }} {{- with .Values.podLabels }} {{- toYaml . | nindent 8 }} {{- end }} @@ -65,6 +65,29 @@ spec: - name: HOME value: "/tmp/agentevals-home" {{- end }} + {{- if eq .Values.storage.backend "postgres" }} + - name: AGENTEVALS_STORAGE_BACKEND + value: "postgres" + - name: AGENTEVALS_DATABASE_SCHEMA + value: {{ .Values.database.postgres.schema | quote }} + {{- if .Values.database.postgres.urlFile }} + - name: AGENTEVALS_DATABASE_URL_FILE + value: {{ .Values.database.postgres.urlFile | quote }} + {{- else if .Values.database.postgres.url }} + - name: AGENTEVALS_DATABASE_URL + value: {{ .Values.database.postgres.url | quote }} + {{- else if .Values.database.postgres.bundled.enabled }} + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "agentevals.passwordSecretName" . 
}} + key: POSTGRES_PASSWORD + - name: AGENTEVALS_DATABASE_URL + value: {{ printf "postgresql://agentevals:$(POSTGRES_PASSWORD)@%s.%s.svc.cluster.local:5432/agentevals?sslmode=disable" (include "agentevals.postgresqlServiceName" .) (include "agentevals.namespace" .) | quote }} + {{- else }} + {{ fail "storage.backend=postgres requires database.postgres.url, database.postgres.urlFile, or database.postgres.bundled.enabled=true" }} + {{- end }} + {{- end }} {{- with .Values.env }} {{- toYaml . | nindent 12 }} {{- end }} diff --git a/charts/agentevals/templates/postgresql-secret.yaml b/charts/agentevals/templates/postgresql-secret.yaml new file mode 100644 index 0000000..21daab6 --- /dev/null +++ b/charts/agentevals/templates/postgresql-secret.yaml @@ -0,0 +1,13 @@ +{{- if and (eq .Values.storage.backend "postgres") .Values.database.postgres.bundled.enabled (not .Values.database.postgres.url) (not .Values.database.postgres.urlFile) }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "agentevals.passwordSecretName" . }} + namespace: {{ include "agentevals.namespace" . }} + labels: + {{- include "agentevals.labels" . | nindent 4 }} + app.kubernetes.io/component: database +type: Opaque +data: + POSTGRES_PASSWORD: {{ "agentevals" | b64enc | quote }} +{{- end }} diff --git a/charts/agentevals/templates/postgresql.yaml b/charts/agentevals/templates/postgresql.yaml new file mode 100644 index 0000000..c4e5370 --- /dev/null +++ b/charts/agentevals/templates/postgresql.yaml @@ -0,0 +1,142 @@ +{{- if and (eq .Values.storage.backend "postgres") .Values.database.postgres.bundled.enabled (not .Values.database.postgres.url) (not .Values.database.postgres.urlFile) }} +{{- $pg := .Values.database.postgres.bundled }} +{{- $fullname := include "agentevals.postgresqlServiceName" . }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ $fullname }} + namespace: {{ include "agentevals.namespace" . }} + labels: + {{- include "agentevals.labels" . | nindent 4 }} + app.kubernetes.io/component: database +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ $fullname }} + namespace: {{ include "agentevals.namespace" . }} + labels: + {{- include "agentevals.labels" . | nindent 4 }} + app.kubernetes.io/component: database +spec: + accessModes: + - ReadWriteOnce + {{- if $pg.storageClassName }} + storageClassName: {{ $pg.storageClassName | quote }} + {{- end }} + resources: + requests: + storage: {{ $pg.storage | quote }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ $fullname }} + namespace: {{ include "agentevals.namespace" . }} + labels: + {{- include "agentevals.labels" . | nindent 4 }} + app.kubernetes.io/component: database +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + {{- include "agentevals.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: database + template: + metadata: + labels: + {{- include "agentevals.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: database + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ $fullname }} + securityContext: + fsGroup: 999 + runAsUser: 999 + runAsGroup: 999 + runAsNonRoot: true + containers: + - name: postgresql + image: {{ include "agentevals.postgresql.image" . 
}} + imagePullPolicy: {{ $pg.image.pullPolicy }} + securityContext: + allowPrivilegeEscalation: false + ports: + - name: postgresql + containerPort: 5432 + protocol: TCP + env: + - name: POSTGRES_DB + value: "agentevals" + - name: POSTGRES_USER + value: "agentevals" + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "agentevals.passwordSecretName" . }} + key: POSTGRES_PASSWORD + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + livenessProbe: + exec: + command: + - pg_isready + - -U + - agentevals + - -d + - agentevals + initialDelaySeconds: 20 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + readinessProbe: + exec: + command: + - pg_isready + - -U + - agentevals + - -d + - agentevals + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + successThreshold: 1 + {{- with $pg.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + volumeMounts: + - name: data + mountPath: /var/lib/postgresql/data + volumes: + - name: data + persistentVolumeClaim: + claimName: {{ $fullname }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ $fullname }} + namespace: {{ include "agentevals.namespace" . }} + labels: + {{- include "agentevals.labels" . | nindent 4 }} + app.kubernetes.io/component: database +spec: + type: ClusterIP + ports: + - name: postgresql + port: 5432 + targetPort: postgresql + protocol: TCP + selector: + {{- include "agentevals.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: database +{{- end }} diff --git a/charts/agentevals/templates/service.yaml b/charts/agentevals/templates/service.yaml index 090ff3c..f224c08 100644 --- a/charts/agentevals/templates/service.yaml +++ b/charts/agentevals/templates/service.yaml @@ -25,4 +25,4 @@ spec: targetPort: mcp protocol: TCP selector: - {{- include "agentevals.selectorLabels" . | nindent 4 }} + {{- include "agentevals.app.selectorLabels" . | nindent 4 }} diff --git a/charts/agentevals/values.yaml b/charts/agentevals/values.yaml index f455af3..17a3571 100644 --- a/charts/agentevals/values.yaml +++ b/charts/agentevals/values.yaml @@ -2,7 +2,10 @@ # Global # ============================================================================== -# -- Number of replicas. Only 1 is supported (no shared job state across pods). +# -- Number of replicas. The default in-memory backend has no shared state, so +# scale beyond 1 only when storage.backend is "postgres" (durable runs/results +# in Postgres are safe to share across replicas via SELECT FOR UPDATE SKIP +# LOCKED claim semantics). replicaCount: 1 # -- Global container image registry (prepended to image.repository) @@ -155,3 +158,67 @@ env: [] # -- Extra envFrom sources (ConfigMapRef, SecretRef) envFrom: [] + +# ============================================================================== +# STORAGE +# ============================================================================== + +storage: + # -- Storage backend. "memory" (default) keeps the developer experience + # zero-config: nothing persisted, restarts lose in-flight state. "postgres" + # enables /api/runs and persists runs + results in Postgres. + backend: memory + +# ============================================================================== +# DATABASE CONFIGURATION +# ============================================================================== +# Used only when storage.backend is "postgres". Priority order (first match wins): +# 1. 
database.postgres.urlFile -- file-based DSN (workload identity friendly) +# 2. database.postgres.url -- literal DSN +# 3. database.postgres.bundled -- chart-bundled Postgres (dev/eval only) +# If none is configured the chart fails to render. + +database: + postgres: + # -- External Postgres connection string. + # When set, takes precedence over the bundled instance regardless of + # database.postgres.bundled.enabled. + url: "" + # -- Path to a file containing the connection string. Takes precedence + # over url when set. Useful for projected workload-identity tokens. + urlFile: "" + # -- Postgres schema to use for agentevals tables. + schema: agentevals + # -- Bundled Postgres instance for development and evaluation only. + # Not suitable for production. Deployed when enabled is true and url / + # urlFile are not set. + bundled: + # -- Set to true to deploy a chart-managed Postgres alongside the app. + # Off by default so the zero-config install stays in-memory. + enabled: false + image: + # -- Bundled Postgres image registry + registry: docker.io + # -- Bundled Postgres image repository (org/namespace) + repository: library + # -- Bundled Postgres image name + name: postgres + # -- Bundled Postgres image tag + tag: "17" + # -- Bundled Postgres image pull policy + pullPolicy: IfNotPresent + # -- PersistentVolumeClaim size for the bundled Postgres data + storage: 1Gi + # -- StorageClass for the PVC. Defaults to the cluster default when empty. + storageClassName: "" + # The database name, user, and password are hardcoded for the bundled + # instance (all: "agentevals"). This is intentional for a dev/eval + # setup. Switch to an external database for production. + # -- Resource requests/limits for the bundled Postgres container + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi diff --git a/pyproject.toml b/pyproject.toml index 936938c..6e0f391 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,9 @@ streaming = [ openai = [ "openai>=2.0", ] +postgres = [ + "asyncpg>=0.30.0", +] [project.scripts] agentevals = "agentevals.cli:main" @@ -43,6 +46,9 @@ artifacts = ["src/agentevals/_static/**"] [tool.hatch.build.targets.wheel] packages = ["src/agentevals"] +[tool.hatch.build.targets.wheel.force-include] +"src/agentevals/storage/postgres/migrations" = "agentevals/storage/postgres/migrations" + [tool.uv.workspace] members = ["packages/evaluator-sdk-py"] diff --git a/src/agentevals/api/app.py b/src/agentevals/api/app.py index ec3b3dd..4790510 100644 --- a/src/agentevals/api/app.py +++ b/src/agentevals/api/app.py @@ -16,13 +16,20 @@ from agentevals import __version__ +from ..run.service import RunService +from ..run.worker import AsyncRunWorker +from ..storage import StorageSettings, build_repos +from ..storage.postgres.migrator import Migrator from ..utils.log_buffer import log_buffer from .debug_routes import debug_router from .routes import router +from .runs_routes import runs_router if TYPE_CHECKING: from ..streaming.ws_server import StreamingTraceManager +logger = logging.getLogger(__name__) + try: from dotenv import load_dotenv @@ -51,7 +58,39 @@ async def lifespan(app: FastAPI): mgr = getattr(app.state, "trace_manager", None) if mgr: mgr.start_cleanup_task() + + storage_settings: StorageSettings | None = None + worker: AsyncRunWorker | None = None + try: + storage_settings = StorageSettings.from_env() + except Exception as exc: + logger.error("Storage configuration invalid; /api/runs will not be available: %s", exc) + + if storage_settings is 
not None and storage_settings.backend == "postgres": + logger.info("Applying any pending migrations to schema '%s'", storage_settings.schema_name) + migrator = Migrator( + dsn=storage_settings.database_url or "", + schema=storage_settings.schema_name, + lock_timeout_s=storage_settings.migrate_lock_timeout_s, + ) + await migrator.up() + + repos = await build_repos(storage_settings) + app.state.storage_settings = storage_settings + app.state.repos = repos + app.state.run_service = RunService(repos.runs, repos.results) + + worker = AsyncRunWorker(runs=repos.runs, results=repos.results, settings=storage_settings) + await worker.start() + app.state.run_worker = worker + yield + + if worker is not None: + await worker.stop() + repos = getattr(app.state, "repos", None) + if repos is not None: + await repos.close() if mgr: await mgr.shutdown() ae_logger.removeHandler(log_buffer) @@ -83,6 +122,7 @@ def create_app( app.include_router(router, prefix="/api") app.include_router(debug_router, prefix="/api/debug") + app.include_router(runs_router, prefix="/api") if trace_manager is not None: app.state.trace_manager = trace_manager diff --git a/src/agentevals/api/routes.py b/src/agentevals/api/routes.py index c65b1af..7538b51 100644 --- a/src/agentevals/api/routes.py +++ b/src/agentevals/api/routes.py @@ -22,6 +22,7 @@ BuiltinMetricDef, CodeEvaluatorDef, CustomEvaluatorDef, + EvalParams, EvalRunConfig, OpenAIEvalDef, ) @@ -68,6 +69,71 @@ def _camel_keys(obj: Any) -> Any: return obj +def _load_eval_set_dict(path: str | None) -> dict | None: + """Read the uploaded eval set file back into a dict for persistence. + + The on-disk file gets cleaned up with the temp dir; capturing the dict + here lets us store it on the run row so a future ``GET /api/runs/{id}`` + can show what was evaluated against without re-uploading the file. + """ + if not path: + return None + try: + with open(path) as f: + return json.load(f) + except (OSError, json.JSONDecodeError): + logger.warning("could not re-read eval_set file at %s for persistence", path) + return None + + +async def _maybe_persist_evaluate_run( + request: Request, + *, + params: "EvalParams", + eval_set_dict: dict | None, + trace_format: str | None, + upload_filenames: list[str] | None, + run_result: "RunResult", +) -> str | None: + """Persist a synchronously-completed eval as a Run + Result rows when + ``app.state.run_service`` is configured (i.e. ``backend=postgres``). + + Returns the synthesized ``run_id`` so the caller can attach it to the + response (UI / SSE clients can then ``GET /api/runs/{id}/results`` to + pull historical context). Returns None on the memory backend so callers + keep their existing zero-config behavior. Errors are logged but never + propagated; if persistence fails the eval result is still returned to + the caller. 
+ """ + service = getattr(request.app.state, "run_service", None) + if service is None: + return None + try: + from ..run.service import RunService + from ..storage.models import RunSpec, TraceTarget + + filenames = list(upload_filenames or []) + target = TraceTarget( + kind="uploaded", + trace_format=trace_format if trace_format in ("jaeger-json", "otlp-json") else None, + trace_count=len(filenames), + trace_files=filenames, + ) + spec_payload = params.model_dump(by_alias=False) + spec = RunSpec( + approach="trace_replay", + target=target, + eval_config=spec_payload, + eval_set=eval_set_dict, + ) + assert isinstance(service, RunService) + run = await service.record_completed_eval(spec=spec, params=params, run_result=run_result) + return str(run.run_id) + except Exception: + logger.exception("failed to persist /api/evaluate run; eval result still returned to caller") + return None + + router = APIRouter() _MAX_JSON_BODY_BYTES = 50 * 1024 * 1024 # 50 MB (multipart endpoints allow 10 MB per file) @@ -434,6 +500,7 @@ async def convert_trace_files( @router.post("/evaluate", response_model=StandardResponse[RunResult]) async def evaluate_traces( + request: Request, trace_files: list[UploadFile] = File(...), config: str = Form(...), eval_set_file: UploadFile | None = File(None), @@ -542,6 +609,17 @@ async def evaluate_traces( logger.info(f"Evaluating {len(trace_paths)} trace file(s) with metrics: {metrics}") result = await run_evaluation(eval_config) + run_id = await _maybe_persist_evaluate_run( + request, + params=eval_config, + eval_set_dict=_load_eval_set_dict(eval_set_path), + trace_format=eval_config.trace_format, + upload_filenames=[tf.filename for tf in trace_files if tf.filename], + run_result=result, + ) + if run_id: + result.run_id = run_id + result_dict = _camel_keys(result.model_dump(by_alias=True)) return StandardResponse(data=result_dict) @@ -557,12 +635,14 @@ async def evaluate_traces( @router.post("/evaluate/stream") async def evaluate_traces_stream( + request: Request, trace_files: list[UploadFile] = File(...), config: str = Form(...), eval_set_file: UploadFile | None = File(None), ): """Evaluate traces with real-time progress via SSE.""" temp_dir = tempfile.mkdtemp() + upload_filenames = [tf.filename for tf in trace_files if tf.filename] async def event_generator(): try: @@ -678,6 +758,16 @@ async def run_with_progress(): tag, payload = msg if tag == "done": + run_id = await _maybe_persist_evaluate_run( + request, + params=eval_config, + eval_set_dict=_load_eval_set_dict(eval_set_path), + trace_format=eval_config.trace_format, + upload_filenames=upload_filenames, + run_result=payload, + ) + if run_id: + payload.run_id = run_id evt = SSEDoneEvent( result=_camel_keys(payload.model_dump(by_alias=True)), ) @@ -768,6 +858,16 @@ async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Reques config=request.config, eval_set=eval_set, ) + run_id = await _maybe_persist_evaluate_run( + raw_request, + params=request.config, + eval_set_dict=request.eval_set, + trace_format=None, + upload_filenames=None, + run_result=result, + ) + if run_id: + result.run_id = run_id return StandardResponse(data=_camel_keys(result.model_dump(by_alias=True))) except Exception as exc: logger.exception("JSON evaluation failed") @@ -827,6 +927,16 @@ async def run_with_progress(): tag, payload = msg if tag == "done": + run_id = await _maybe_persist_evaluate_run( + raw_request, + params=request.config, + eval_set_dict=request.eval_set, + trace_format=None, + upload_filenames=None, + 
run_result=payload, + ) + if run_id: + payload.run_id = run_id evt = SSEDoneEvent( result=_camel_keys(payload.model_dump(by_alias=True)), ) diff --git a/src/agentevals/api/runs_routes.py b/src/agentevals/api/runs_routes.py new file mode 100644 index 0000000..99ae71a --- /dev/null +++ b/src/agentevals/api/runs_routes.py @@ -0,0 +1,114 @@ +"""HTTP router for the async run pipeline. + +Mounted only when ``AGENTEVALS_STORAGE_BACKEND=postgres``. Submission is +idempotent on ``run_id``: re-posting the same id with an identical spec +returns the persisted row; re-posting with a different spec returns +``409 Conflict``. +""" + +from __future__ import annotations + +import logging +from datetime import datetime +from uuid import UUID + +from fastapi import APIRouter, HTTPException, Query, Request, status +from pydantic import ConfigDict +from pydantic.alias_generators import to_camel + +from ..run.service import RunService, RunSubmitConflict +from ..storage.models import Result, Run, RunSpec, RunStatus +from .models import CamelModel, StandardResponse + +logger = logging.getLogger(__name__) + +runs_router = APIRouter(tags=["runs"]) + + +class RunRequest(CamelModel): + """POST body for ``/api/runs``.""" + + run_id: UUID | None = None + spec: RunSpec + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True, extra="allow") + + +class RunSummary(CamelModel): + run_id: UUID + status: RunStatus + created_at: datetime + + +def _service(request: Request) -> RunService: + service = getattr(request.app.state, "run_service", None) + if service is None: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="run service is not configured (set AGENTEVALS_STORAGE_BACKEND=postgres)", + ) + return service + + +@runs_router.post( + "/runs", + response_model=StandardResponse[Run], + status_code=status.HTTP_202_ACCEPTED, +) +async def submit_run(payload: RunRequest, request: Request): + service = _service(request) + try: + run = await service.submit(run_id=payload.run_id, spec=payload.spec) + except RunSubmitConflict as exc: + raise HTTPException( + status_code=status.HTTP_409_CONFLICT, + detail={ + "message": "run_id already exists with a different spec", + "persisted": exc.persisted.model_dump(mode="json", by_alias=True), + }, + ) from exc + return StandardResponse(data=run) + + +@runs_router.get("/runs/{run_id}", response_model=StandardResponse[Run]) +async def get_run(run_id: UUID, request: Request): + service = _service(request) + run = await service.get(run_id) + if run is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"run {run_id} not found") + return StandardResponse(data=run) + + +@runs_router.get("/runs", response_model=StandardResponse[list[Run]]) +async def list_runs( + request: Request, + status_filter: list[RunStatus] | None = Query(default=None, alias="status"), + limit: int = Query(default=100, ge=1, le=1000), + before: datetime | None = Query(default=None), +): + service = _service(request) + runs = await service.list(status=status_filter, limit=limit, before=before) + return StandardResponse(data=runs) + + +@runs_router.get("/runs/{run_id}/results", response_model=StandardResponse[list[Result]]) +async def list_run_results(run_id: UUID, request: Request): + service = _service(request) + run = await service.get(run_id) + if run is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"run {run_id} not found") + results = await service.list_results(run_id) + return StandardResponse(data=results) + + 
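+# Cancellation is cooperative rather than immediate: per the worker module,
+# POST /api/runs/{id}/cancel only flags ``cancel_requested`` on the row. The
+# owning worker's heartbeat observes the flag on its next tick and tears the
+# run down, so a successful cancel may still report a non-terminal status for
+# a few seconds; poll ``GET /api/runs/{run_id}`` to watch it settle.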
+@runs_router.post("/runs/{run_id}/cancel", response_model=StandardResponse[Run]) +async def cancel_run(run_id: UUID, request: Request): + service = _service(request) + cancelled = await service.cancel(run_id) + run = await service.get(run_id) + if run is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"run {run_id} not found") + if not cancelled and run.status not in (RunStatus.QUEUED, RunStatus.RUNNING): + # Already terminal; surface that to the caller without an error. + return StandardResponse(data=run) + return StandardResponse(data=run) diff --git a/src/agentevals/cli.py b/src/agentevals/cli.py index 9f1c37e..666cce8 100644 --- a/src/agentevals/cli.py +++ b/src/agentevals/cli.py @@ -729,6 +729,147 @@ def serve( asyncio.run(_run_servers(host, port, otlp_http_port, otlp_grpc_port, mcp_port=mcp_port)) +# --------------------------------------------------------------------------- +# agentevals migrate ... +# --------------------------------------------------------------------------- + + +@main.group("migrate") +def migrate_group() -> None: + """Manage the Postgres schema for AGENTEVALS_STORAGE_BACKEND=postgres.""" + + +def _migrator_or_die() -> "object": + from pydantic import ValidationError + + from .storage.config import StorageSettings + from .storage.postgres.migrator import Migrator + + try: + settings = StorageSettings.from_env() + except ValidationError as exc: + # Extract the first inner message so CLI users see "AGENTEVALS_..." rather + # than the multi-line Pydantic dump. + first = exc.errors()[0] if exc.errors() else {"msg": str(exc)} + raise click.ClickException(first.get("msg", str(exc))) from exc + except Exception as exc: + raise click.ClickException(str(exc)) from exc + if not settings.database_url: + raise click.ClickException("AGENTEVALS_DATABASE_URL is required for migrations") + return Migrator( + dsn=settings.database_url, + schema=settings.schema_name, + lock_timeout_s=settings.migrate_lock_timeout_s, + ) + + +@migrate_group.command("up") +@click.option("--dry-run", is_flag=True, help="Print which migrations would apply without executing.") +def migrate_up(dry_run: bool) -> None: + """Apply all pending migrations.""" + migrator = _migrator_or_die() + try: + applied = asyncio.run(migrator.up(dry_run=dry_run)) + except Exception as exc: + raise click.ClickException(f"migration failed: {exc}") from exc + if not applied: + click.echo("Nothing to apply.") + else: + verb = "Would apply" if dry_run else "Applied" + for v in applied: + click.echo(f"{verb} {v:06d}") + + +@migrate_group.command("down") +@click.option("--steps", type=int, required=True, help="Number of migrations to roll back (>= 1).") +@click.confirmation_option( + prompt="Rolling back migrations is destructive and may delete data. Continue?", +) +def migrate_down(steps: int) -> None: + """Roll back the last N migrations. 
Prints SQL for each step before executing.""" + migrator = _migrator_or_die() + try: + rolled = asyncio.run(migrator.down(steps=steps)) + except Exception as exc: + raise click.ClickException(f"rollback failed: {exc}") from exc + if not rolled: + click.echo("Nothing to roll back.") + else: + for version, name in rolled: + click.echo(f"Rolled back {version:06d}_{name}") + + +@migrate_group.command("version") +def migrate_version() -> None: + """Print the current schema version and the dirty flag.""" + migrator = _migrator_or_die() + status = asyncio.run(migrator.status()) + if status.version is None: + click.echo("schema not initialized (no migrations applied)") + else: + click.echo(f"version={status.version:06d} dirty={status.dirty}") + + +@migrate_group.command("force") +@click.argument("version", type=int) +def migrate_force(version: int) -> None: + """Set the schema version and clear the dirty flag. Recovery only. + + Use after fixing a partially-applied migration manually. Does not run any + SQL; only updates the schema_migrations row. + """ + migrator = _migrator_or_die() + asyncio.run(migrator.force(version)) + click.echo(f"forced version={version:06d} dirty=False") + + +@migrate_group.command("create") +@click.argument("name") +@click.option( + "--output-dir", + "-o", + type=click.Path(file_okay=False), + default=None, + help="Where to write the new files (defaults to the in-tree migrations directory).", +) +def migrate_create(name: str, output_dir: str | None) -> None: + """Generate an empty NNNNNN_.up.sql + .down.sql pair.""" + import re as _re + from pathlib import Path as _Path + + if not _re.match(r"^[a-z0-9_]+$", name): + raise click.ClickException("name must match [a-z0-9_]+") + + from .storage.postgres.migrator import discover_migrations + + if output_dir is None: + repo_path = _Path(__file__).resolve().parent / "storage" / "postgres" / "migrations" + if not repo_path.is_dir(): + raise click.ClickException( + f"migrations dir not found at {repo_path} (run 'create' from a checkout, not an installed wheel)" + ) + target = repo_path + else: + target = _Path(output_dir) + target.mkdir(parents=True, exist_ok=True) + + existing = discover_migrations() + next_version = (max((m.version for m in existing), default=0) + 1) if existing else 1 + up_path = target / f"{next_version:06d}_{name}.up.sql" + down_path = target / f"{next_version:06d}_{name}.down.sql" + if up_path.exists() or down_path.exists(): + raise click.ClickException(f"{up_path.name} or {down_path.name} already exists") + + header = ( + f"-- Migration {next_version:06d}: {name}\n" + "-- Once tagged in a release this file is immutable. Fix bugs by adding a NEW migration.\n\n" + ) + up_path.write_text(header) + down_path.write_text(header) + click.echo(f"Created {up_path}") + click.echo(f"Created {down_path}") + + @main.command("mcp") @click.option( "--server-url", diff --git a/src/agentevals/run/__init__.py b/src/agentevals/run/__init__.py new file mode 100644 index 0000000..1a0e482 --- /dev/null +++ b/src/agentevals/run/__init__.py @@ -0,0 +1,9 @@ +"""Async run pipeline for ``POST /api/runs``. + +Contents: +- :mod:`fetcher` resolves a run spec's ``target`` into a list of traces. +- :mod:`sinks` fan-out result delivery (stdout, file, http_webhook). +- :mod:`service` is the synchronous control surface used by HTTP handlers. +- :mod:`worker` is the in-process loop that claims runs and drives the + existing :func:`agentevals.runner.run_evaluation_from_traces` pipeline. 
+""" diff --git a/src/agentevals/run/fetcher.py b/src/agentevals/run/fetcher.py new file mode 100644 index 0000000..34f8bae --- /dev/null +++ b/src/agentevals/run/fetcher.py @@ -0,0 +1,83 @@ +"""Trace fetchers — resolve a run spec's ``target`` into a list of Trace objects. + +Two implementations ship: ``inline`` (the JSON payload is embedded in the +spec) and ``http`` (the worker GETs ``{base_url}/{trace_id}`` with headers +sourced from ``context.headers``). Auth headers are pass-through; this layer +does not validate them. +""" + +from __future__ import annotations + +import json +import logging +import tempfile +from pathlib import Path +from typing import Protocol + +import httpx + +from ..loader import load_traces +from ..loader.base import Trace +from ..storage.models import TraceTarget + +logger = logging.getLogger(__name__) + + +class TraceFetcher(Protocol): + async def fetch(self, target: TraceTarget, context: dict) -> list[Trace]: ... + + +class InlineTraceFetcher: + """Materializes inline JSON to a temp file and parses it via the existing loader. + + The temp file dance reuses :func:`agentevals.loader.load_traces` (which + auto-detects format) without a special-case in the loader for dict input. + """ + + async def fetch(self, target: TraceTarget, context: dict) -> list[Trace]: + if not target.inline: + raise ValueError("InlineTraceFetcher requires target.inline to be set") + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(target.inline, f) + path = Path(f.name) + try: + return load_traces(str(path), format=target.trace_format) + finally: + path.unlink(missing_ok=True) # noqa: ASYNC240 + + +class HttpTraceFetcher: + """Fetches the trace JSON over HTTP. Auth is opaque header pass-through.""" + + def __init__(self, timeout_s: float = 30.0) -> None: + self._timeout_s = timeout_s + + async def fetch(self, target: TraceTarget, context: dict) -> list[Trace]: + if not target.base_url or not target.trace_id: + raise ValueError("HttpTraceFetcher requires target.base_url and target.trace_id") + url = target.base_url.rstrip("/") + "/" + target.trace_id + headers = (context.get("headers") if isinstance(context, dict) else {}) or {} + async with httpx.AsyncClient(timeout=self._timeout_s) as client: + resp = await client.get(url, headers=headers) + resp.raise_for_status() + payload = resp.json() + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(payload, f) + path = Path(f.name) + try: + return load_traces(str(path), format=target.trace_format) + finally: + path.unlink(missing_ok=True) # noqa: ASYNC240 + + +def resolve_fetcher(target: TraceTarget) -> TraceFetcher: + if target.kind == "inline": + return InlineTraceFetcher() + if target.kind == "http": + return HttpTraceFetcher() + if target.kind == "uploaded": + raise ValueError( + "target kind 'uploaded' records a synchronous /api/evaluate call and cannot be " + "re-executed by the worker; the run already completed at submission time" + ) + raise ValueError(f"unknown trace target kind '{target.kind}'") diff --git a/src/agentevals/run/result_builder.py b/src/agentevals/run/result_builder.py new file mode 100644 index 0000000..5d30d74 --- /dev/null +++ b/src/agentevals/run/result_builder.py @@ -0,0 +1,82 @@ +"""Shared helpers that project a :class:`agentevals.runner.RunResult` onto +the persisted shapes (:class:`agentevals.storage.models.Result` rows + a +JSON ``summary`` blob). 
+ +Used both by the async worker (when a queued run finishes) and by the +``/api/evaluate`` route handler (when a synchronous UI upload finishes), so +both paths produce identical persisted shapes. +""" + +from __future__ import annotations + +from typing import Any, Literal +from uuid import UUID + +from ..config import EvalParams +from ..runner import RunResult +from ..storage.models import Result + +EvaluatorType = Literal["builtin", "code", "remote", "openai_eval"] + + +def classify_evaluator(metric_name: str, params: EvalParams) -> EvaluatorType: + """Look up whether a metric was a built-in or a custom evaluator, + falling back to ``builtin`` so unknown names round-trip cleanly rather + than raising during persistence.""" + for ce in params.custom_evaluators: + if ce.name == metric_name: + return ce.type + return "builtin" + + +def build_results(run_id: UUID, params: EvalParams, run_result: RunResult) -> list[Result]: + """Flatten ``run_result.trace_results[*].metric_results[*]`` into a list + of persistable :class:`Result` rows. + + The ``eval_set_item_id`` and ``eval_set_item_name`` both default to the + trace_id, since OSS doesn't currently extract a stable per-eval-case + identifier from the ADK :class:`EvalSet`. Callers may post-process to + attach their own identifiers. + """ + out: list[Result] = [] + for trace_result in run_result.trace_results: + item_id = trace_result.trace_id + item_name = trace_result.trace_id + for mr in trace_result.metric_results: + out.append( + Result.from_metric_result( + run_id=run_id, + eval_set_item_id=item_id, + eval_set_item_name=item_name, + trace_id=trace_result.trace_id, + evaluator_type=classify_evaluator(mr.metric_name, params), + metric_result=mr, + ) + ) + return out + + +def summarize_run_result(run_result: RunResult) -> dict[str, Any]: + """Summary blob persisted alongside the run row. + + Counts mirror :class:`agentevals.storage.models.ResultStatus` values so a + caller polling ``GET /api/runs/{id}`` can compute pass/fail rates without + fetching the full result list. + """ + counts = {"passed": 0, "failed": 0, "errored": 0, "skipped": 0} + for tr in run_result.trace_results: + for mr in tr.metric_results: + if mr.error: + counts["errored"] += 1 + elif (mr.eval_status or "").upper() == "PASSED": + counts["passed"] += 1 + elif (mr.eval_status or "").upper() == "FAILED": + counts["failed"] += 1 + else: + counts["skipped"] += 1 + return { + "trace_count": len(run_result.trace_results), + "result_counts": counts, + "errors": list(run_result.errors), + "performance_metrics": run_result.performance_metrics, + } diff --git a/src/agentevals/run/service.py b/src/agentevals/run/service.py new file mode 100644 index 0000000..e0b3f36 --- /dev/null +++ b/src/agentevals/run/service.py @@ -0,0 +1,127 @@ +"""Synchronous control surface used by ``/api/runs`` HTTP handlers. + +Wraps the :class:`agentevals.storage.repos.RunRepository` with submit +idempotency, list pagination, and the 409 spec-mismatch path. + +Also provides :meth:`RunService.record_completed_eval` for the +``/api/evaluate`` path: that handler executes synchronously (the trace was +already supplied as multipart and the result is being streamed back over +SSE), so we synthesize a Run row for visibility in run history rather than +queueing work for the worker. 
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from datetime import datetime, timezone
+from uuid import UUID, uuid4
+
+from ..config import EvalParams
+from ..runner import RunResult
+from ..storage.models import Run, RunSpec, RunStatus
+from ..storage.repos import ResultRepository, RunRepository
+from .result_builder import build_results, summarize_run_result
+
+logger = logging.getLogger(__name__)
+
+
+def _now() -> datetime:
+    return datetime.now(timezone.utc)
+
+
+class RunSubmitConflict(Exception):
+    """Raised when a re-submission's spec differs from the persisted one.
+
+    The caller (HTTP handler) maps this to ``409 Conflict`` and returns the
+    persisted run so the client can reconcile.
+    """
+
+    def __init__(self, persisted: Run) -> None:
+        super().__init__(f"run {persisted.run_id} already exists with a different spec")
+        self.persisted = persisted
+
+
+class RunService:
+    def __init__(self, runs: RunRepository, results: ResultRepository) -> None:
+        self._runs = runs
+        self._results = results
+
+    async def submit(self, *, run_id: UUID | None, spec: RunSpec) -> Run:
+        run = Run(
+            run_id=run_id or uuid4(),
+            status=RunStatus.QUEUED,
+            spec=spec,
+        )
+        persisted = await self._runs.create(run)
+        if persisted.run_id == run.run_id and not _specs_equal(persisted.spec, spec):
+            raise RunSubmitConflict(persisted)
+        return persisted
+
+    async def get(self, run_id: UUID) -> Run | None:
+        return await self._runs.get(run_id)
+
+    async def list(
+        self,
+        *,
+        status: list[RunStatus] | None = None,
+        limit: int = 100,
+        before: datetime | None = None,
+    ) -> list[Run]:
+        return await self._runs.list(status=status, limit=limit, before=before)
+
+    async def list_results(self, run_id: UUID):
+        return await self._results.list_by_run(run_id)
+
+    async def cancel(self, run_id: UUID) -> bool:
+        return await self._runs.cancel(run_id)
+
+    async def record_completed_eval(
+        self,
+        *,
+        spec: RunSpec,
+        params: EvalParams,
+        run_result: RunResult,
+    ) -> Run:
+        """Persist a synchronously-completed eval as a Run row plus Result rows.
+
+        The run is created as ``queued`` but already carrying a synthetic
+        worker id and ``started_at`` (which keeps the ``run_running_has_worker``
+        check satisfied without a real executor claiming the row), then
+        transitioned to a terminal state in the same call. Two writes per
+        eval, but using the public :class:`RunRepository` API avoids leaking
+        an executor-only schema requirement into this layer.
+        """
+        run_id = uuid4()
+        worker_id = "sync:/api/evaluate"
+        run = Run(
+            run_id=run_id,
+            status=RunStatus.QUEUED,
+            spec=spec,
+            attempt=1,
+            worker_id=worker_id,
+            started_at=_now(),
+        )
+        await self._runs.create(run)
+
+        results = build_results(run_id, params, run_result)
+        await self._results.upsert_many(run_id, results)
+
+        summary = summarize_run_result(run_result)
+        if run_result.errors:
+            error = "; ".join(run_result.errors[:3])
+            await self._runs.update_status(run_id, RunStatus.FAILED, error=error, summary=summary)
+            run.status = RunStatus.FAILED
+            run.error = error
+        else:
+            await self._runs.update_status(run_id, RunStatus.SUCCEEDED, summary=summary)
+            run.status = RunStatus.SUCCEEDED
+            run.summary = summary
+        return run
+
+
+def _specs_equal(a: RunSpec, b: RunSpec) -> bool:
+    """Deep equality on the JSON projection.
Pydantic equality compares model + instances by class identity, which trips up the round-trip from JSONB.""" + return json.dumps(a.model_dump(by_alias=False), sort_keys=True) == json.dumps( + b.model_dump(by_alias=False), sort_keys=True + ) diff --git a/src/agentevals/run/sinks.py b/src/agentevals/run/sinks.py new file mode 100644 index 0000000..d12eac1 --- /dev/null +++ b/src/agentevals/run/sinks.py @@ -0,0 +1,230 @@ +"""Result sinks — best-effort fan-out of run results. + +The :class:`agentevals.storage.repos.ResultRepository` is always written; +sinks are an additional delivery channel. Sink failures are logged with +``run_id`` / ``result_id`` but do not fail the run. +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +import sys +from pathlib import Path +from typing import Any, Protocol +from uuid import UUID + +import httpx + +from ..storage.models import Result + +logger = logging.getLogger(__name__) + + +class ResultSink(Protocol): + async def emit_partial(self, run_id: UUID, results: list[Result], attempt: int) -> None: ... + async def emit_final(self, run_id: UUID, summary: dict, attempt: int) -> None: ... + async def emit_error(self, run_id: UUID, error: str, attempt: int) -> None: ... + + +def _result_payload(r: Result) -> dict: + return r.model_dump(mode="json", by_alias=True) + + +class StdoutSink: + async def emit_partial(self, run_id: UUID, results: list[Result], attempt: int) -> None: + for r in results: + sys.stdout.write( + json.dumps({"phase": "partial", "run_id": str(run_id), "result": _result_payload(r)}) + "\n" + ) + sys.stdout.flush() + + async def emit_final(self, run_id: UUID, summary: dict, attempt: int) -> None: + sys.stdout.write(json.dumps({"phase": "final", "run_id": str(run_id), "summary": summary}) + "\n") + sys.stdout.flush() + + async def emit_error(self, run_id: UUID, error: str, attempt: int) -> None: + sys.stdout.write(json.dumps({"phase": "error", "run_id": str(run_id), "error": error}) + "\n") + sys.stdout.flush() + + +class FileSink: + """Append-only newline-delimited JSON. Each event is one line.""" + + def __init__(self, path: str | Path) -> None: + self._path = Path(path) + self._lock = asyncio.Lock() + + async def _write(self, payload: dict) -> None: + async with self._lock: + self._path.parent.mkdir(parents=True, exist_ok=True) + with self._path.open("a") as f: # noqa: ASYNC230 + f.write(json.dumps(payload) + "\n") + + async def emit_partial(self, run_id: UUID, results: list[Result], attempt: int) -> None: + for r in results: + await self._write({"phase": "partial", "run_id": str(run_id), "result": _result_payload(r)}) + + async def emit_final(self, run_id: UUID, summary: dict, attempt: int) -> None: + await self._write({"phase": "final", "run_id": str(run_id), "summary": summary}) + + async def emit_error(self, run_id: UUID, error: str, attempt: int) -> None: + await self._write({"phase": "error", "run_id": str(run_id), "error": error}) + + +class HttpWebhookSink: + """POST JSON to a URL with retries. + + Auth headers come from the spec via ``headers`` (literal values) or + ``headers_from_env`` (env var names whose values are read at emit time). + Reading at emit time means a host can rotate the env var without + restarting agentevals. 
+ """ + + def __init__( + self, + url: str, + *, + headers: dict[str, str] | None = None, + headers_from_env: dict[str, str] | None = None, + timeout_s: float = 10.0, + max_attempts: int = 5, + ) -> None: + self._url = url + self._headers = headers or {} + self._headers_from_env = headers_from_env or {} + self._timeout_s = timeout_s + self._max_attempts = max_attempts + + def _resolve_headers(self) -> dict[str, str]: + merged = dict(self._headers) + for header, env_var in self._headers_from_env.items(): + value = os.environ.get(env_var) + if value is not None: + merged[header] = value + merged.setdefault("Content-Type", "application/json") + return merged + + async def _post(self, payload: dict) -> None: + delay = 0.5 + last_exc: Exception | None = None + for attempt in range(1, self._max_attempts + 1): + try: + async with httpx.AsyncClient(timeout=self._timeout_s) as client: + resp = await client.post(self._url, json=payload, headers=self._resolve_headers()) + if resp.status_code < 500: + if resp.status_code >= 400: + logger.warning( + "Webhook %s returned %d: %s (run_id=%s)", + self._url, + resp.status_code, + resp.text[:200], + payload.get("run_id"), + ) + return + last_exc = RuntimeError(f"HTTP {resp.status_code}: {resp.text[:200]}") + except (httpx.HTTPError, RuntimeError) as exc: + last_exc = exc + if attempt < self._max_attempts: + await asyncio.sleep(delay) + delay = min(delay * 2, 10.0) + logger.error( + "Webhook %s failed after %d attempts: %s (run_id=%s)", + self._url, + self._max_attempts, + last_exc, + payload.get("run_id"), + ) + + async def emit_partial(self, run_id: UUID, results: list[Result], attempt: int) -> None: + await self._post( + { + "phase": "partial", + "run_id": str(run_id), + "attempt": attempt, + "results": [_result_payload(r) for r in results], + } + ) + + async def emit_final(self, run_id: UUID, summary: dict, attempt: int) -> None: + await self._post({"phase": "final", "run_id": str(run_id), "attempt": attempt, "summary": summary}) + + async def emit_error(self, run_id: UUID, error: str, attempt: int) -> None: + await self._post({"phase": "error", "run_id": str(run_id), "attempt": attempt, "error": error}) + + +class SinkFanout: + """Runs sinks in parallel. Failures are isolated per sink.""" + + def __init__(self, sinks: list[ResultSink]) -> None: + self._sinks = sinks + + async def emit_partial(self, run_id: UUID, results: list[Result], attempt: int) -> None: + await asyncio.gather( + *(self._guard(s.emit_partial(run_id, results, attempt), "partial") for s in self._sinks), + return_exceptions=False, + ) + + async def emit_final(self, run_id: UUID, summary: dict, attempt: int) -> None: + await asyncio.gather( + *(self._guard(s.emit_final(run_id, summary, attempt), "final") for s in self._sinks), + return_exceptions=False, + ) + + async def emit_error(self, run_id: UUID, error: str, attempt: int) -> None: + await asyncio.gather( + *(self._guard(s.emit_error(run_id, error, attempt), "error") for s in self._sinks), + return_exceptions=False, + ) + + @staticmethod + async def _guard(coro: Any, phase: str) -> None: + try: + await coro + except Exception: + logger.exception("sink delivery failed in phase=%s", phase) + + +def build_sinks(specs: list[dict]) -> SinkFanout: + """Construct a fan-out from the run spec's ``sinks`` array. + + Each spec is a dict with ``kind`` plus kind-specific args. Unknown kinds + are skipped with a warning so a future kind added by a host doesn't + break older agentevals replicas mid-rollout. 
+ """ + sinks: list[ResultSink] = [] + for spec in specs: + kind = spec.get("kind") + if kind == "stdout": + sinks.append(StdoutSink()) + elif kind == "file": + sinks.append(FileSink(spec["path"])) + elif kind == "http_webhook": + sinks.append( + HttpWebhookSink( + url=spec["url"], + headers=spec.get("headers"), + headers_from_env=spec.get("headers_from_env") or _extract_env_headers(spec.get("auth")), + timeout_s=float(spec.get("timeout_s", 10.0)), + max_attempts=int(spec.get("max_attempts", 5)), + ) + ) + else: + logger.warning("unknown sink kind '%s'; skipping", kind) + return SinkFanout(sinks) + + +def _extract_env_headers(auth: Any) -> dict[str, str]: + """Map the design-doc shape ``auth.headers..from_env`` to env-var lookups.""" + result: dict[str, str] = {} + if not isinstance(auth, dict): + return result + headers = auth.get("headers") if auth.get("kind") == "headers" else None + if not isinstance(headers, dict): + return result + for header_name, value in headers.items(): + if isinstance(value, dict) and "from_env" in value: + result[header_name] = value["from_env"] + return result diff --git a/src/agentevals/run/worker.py b/src/agentevals/run/worker.py new file mode 100644 index 0000000..0f2562d --- /dev/null +++ b/src/agentevals/run/worker.py @@ -0,0 +1,188 @@ +"""Async run worker. + +A pool of asyncio tasks each loop on ``run_repo.claim_next``, heartbeat the +lease while executing, and drive the existing +:func:`agentevals.runner.run_evaluation_from_traces` pipeline. + +Cancellation is signaled by setting ``run.cancel_requested`` via +``POST /api/runs/{id}/cancel``. The heartbeat task observes the flag on each +tick and cancels the worker task; the worker catches and finalizes the run +as ``cancelled``. +""" + +from __future__ import annotations + +import asyncio +import logging +import socket +from datetime import datetime, timedelta, timezone +from uuid import UUID + +from google.adk.evaluation.eval_set import EvalSet + +from ..config import EvalParams +from ..runner import RunResult, TraceResult, run_evaluation_from_traces +from ..storage.config import StorageSettings +from ..storage.models import Run, RunStatus +from ..storage.repos import ResultRepository, RunRepository +from .fetcher import resolve_fetcher +from .result_builder import build_results, summarize_run_result +from .sinks import SinkFanout, build_sinks + +logger = logging.getLogger(__name__) + + +def _now() -> datetime: + return datetime.now(timezone.utc) + + +class _CancelledByRequest(Exception): + """Raised inside the worker task when the heartbeat observes cancel_requested.""" + + +class AsyncRunWorker: + """Manages the worker task pool. 
``start()`` spawns N loops; ``stop()`` + cancels them and waits for graceful shutdown.""" + + def __init__( + self, + *, + runs: RunRepository, + results: ResultRepository, + settings: StorageSettings, + ) -> None: + self._runs = runs + self._results = results + self._settings = settings + self._tasks: list[asyncio.Task] = [] + self._stopping = asyncio.Event() + self._worker_id_prefix = f"{socket.gethostname()}/{id(self):x}" + + async def start(self) -> None: + self._stopping.clear() + for i in range(self._settings.max_concurrent_runs): + wid = f"{self._worker_id_prefix}/{i}" + self._tasks.append(asyncio.create_task(self._loop(wid), name=f"agentevals-worker-{i}")) + logger.info( + "Started %d run worker(s) (lease=%ds, heartbeat=%ds, deadline=%ds)", + self._settings.max_concurrent_runs, + self._settings.lease_s, + self._settings.heartbeat_s, + self._settings.run_deadline_s, + ) + + async def stop(self) -> None: + self._stopping.set() + for t in self._tasks: + t.cancel() + if self._tasks: + await asyncio.gather(*self._tasks, return_exceptions=True) + self._tasks.clear() + logger.info("Run workers stopped") + + async def _loop(self, worker_id: str) -> None: + lease = timedelta(seconds=self._settings.lease_s) + poll = self._settings.worker_poll_interval_s + while not self._stopping.is_set(): + try: + run = await self._runs.claim_next( + worker_id=worker_id, + lease=lease, + max_attempts=self._settings.max_run_attempts, + ) + except asyncio.CancelledError: + return + except Exception: + logger.exception("claim_next failed; backing off") + await asyncio.sleep(min(poll * 5, 30.0)) + continue + + if run is None: + try: + await asyncio.sleep(poll) + except asyncio.CancelledError: + return + continue + + await self._execute(run, worker_id) + + async def _execute(self, run: Run, worker_id: str) -> None: + logger.info("worker=%s claimed run=%s (attempt=%d)", worker_id, run.run_id, run.attempt) + cancel_event = asyncio.Event() + hb_task = asyncio.create_task(self._heartbeat(run.run_id, worker_id, cancel_event)) + sinks = build_sinks(run.spec.sinks or []) + try: + await self._run_evaluation(run, sinks, cancel_event) + except asyncio.CancelledError: + await self._runs.update_status(run.run_id, RunStatus.CANCELLED, error="worker cancelled") + await sinks.emit_error(run.run_id, "worker cancelled", run.attempt) + raise + except _CancelledByRequest: + logger.info("run=%s cancelled by request", run.run_id) + await self._runs.update_status(run.run_id, RunStatus.CANCELLED, error="cancelled by request") + await sinks.emit_error(run.run_id, "cancelled by request", run.attempt) + except TimeoutError: + logger.warning("run=%s exceeded deadline of %ds", run.run_id, self._settings.run_deadline_s) + await self._runs.update_status(run.run_id, RunStatus.FAILED, error="deadline_exceeded") + await sinks.emit_error(run.run_id, "deadline_exceeded", run.attempt) + except Exception as exc: + logger.exception("run=%s failed", run.run_id) + await self._runs.update_status(run.run_id, RunStatus.FAILED, error=str(exc)) + await sinks.emit_error(run.run_id, str(exc), run.attempt) + finally: + hb_task.cancel() + try: + await hb_task + except (asyncio.CancelledError, Exception): + pass + + async def _run_evaluation(self, run: Run, sinks: SinkFanout, cancel_event: asyncio.Event) -> None: + params = EvalParams.model_validate(run.spec.eval_config or {}) + eval_set: EvalSet | None = None + if run.spec.eval_set: + eval_set = EvalSet.model_validate(run.spec.eval_set) + + fetcher = resolve_fetcher(run.spec.target) + + async def 
_trace_progress(trace_result: TraceResult) -> None: + partial = build_results(run.run_id, params, RunResult(trace_results=[trace_result])) + await self._results.upsert_many(run.run_id, partial) + await sinks.emit_partial(run.run_id, partial, run.attempt) + if cancel_event.is_set(): + raise _CancelledByRequest() + + async with asyncio.timeout(self._settings.run_deadline_s): + traces = await fetcher.fetch(run.spec.target, run.spec.context) + if cancel_event.is_set(): + raise _CancelledByRequest() + run_result = await run_evaluation_from_traces( + traces=traces, + config=params, + eval_set=eval_set, + trace_progress_callback=_trace_progress, + ) + + results = build_results(run.run_id, params, run_result) + await self._results.upsert_many(run.run_id, results) + summary = summarize_run_result(run_result) + await sinks.emit_final(run.run_id, summary, run.attempt) + await self._runs.update_status(run.run_id, RunStatus.SUCCEEDED, summary=summary) + logger.info( + "run=%s succeeded (traces=%d, results=%d)", + run.run_id, + len(run_result.trace_results), + len(results), + ) + + async def _heartbeat(self, run_id: UUID, worker_id: str, cancel_event: asyncio.Event) -> None: + lease = timedelta(seconds=self._settings.lease_s) + interval = self._settings.heartbeat_s + try: + while True: + await asyncio.sleep(interval) + alive = await self._runs.heartbeat(run_id, worker_id, lease) + if not alive: + cancel_event.set() + return + except asyncio.CancelledError: + return diff --git a/src/agentevals/runner.py b/src/agentevals/runner.py index 7b5c5fc..5e4f634 100644 --- a/src/agentevals/runner.py +++ b/src/agentevals/runner.py @@ -59,6 +59,7 @@ class RunResult(BaseModel): trace_results: list[TraceResult] = Field(default_factory=list) errors: list[str] = Field(default_factory=list) performance_metrics: dict[str, Any] | None = None + run_id: str | None = None def load_eval_set(path: str) -> EvalSet: diff --git a/src/agentevals/storage/__init__.py b/src/agentevals/storage/__init__.py new file mode 100644 index 0000000..49a35b5 --- /dev/null +++ b/src/agentevals/storage/__init__.py @@ -0,0 +1,48 @@ +"""Storage abstractions for agentevals. + +Two backends ship: ``memory`` (default, preserves zero-config developer +experience) and ``postgres`` (durable runs/results, enables ``/api/runs``). + +The public surface is :class:`Repos`, a small bundle of repository +implementations selected by :class:`StorageSettings.backend`. +""" + +from __future__ import annotations + +from .config import StorageSettings +from .models import Result, ResultStatus, Run, RunSpec, RunStatus, TraceTarget +from .repos import Repos, ResultRepository, RunRepository, SessionRepository + +__all__ = [ + "Repos", + "Result", + "ResultRepository", + "ResultStatus", + "Run", + "RunRepository", + "RunSpec", + "RunStatus", + "SessionRepository", + "StorageSettings", + "TraceTarget", + "build_repos", +] + + +async def build_repos(settings: StorageSettings) -> Repos: + """Construct the repository bundle for ``settings.backend``. + + Memory backend instantiates dict-backed repos eagerly. Postgres backend + creates an asyncpg pool, applies pending migrations, then wires repos + against that pool. 
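+
+    A sketch of the intended call pattern (the server lifespan does the real
+    wiring; shown here only to make ownership of ``close()`` explicit)::
+
+        settings = StorageSettings.from_env()
+        repos = await build_repos(settings)
+        try:
+            ...  # hand repos.runs / repos.results to the API and worker
+        finally:
+            await repos.close()  # no-op for memory; closes the pool for postgres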
+ """ + if settings.backend == "memory": + from .repos.memory import MemoryRepos + + return MemoryRepos.create() + + from .postgres.pool import create_pool + from .repos.postgres import PostgresRepos + + pool = await create_pool(settings) + return await PostgresRepos.create(pool=pool, schema=settings.schema_name) diff --git a/src/agentevals/storage/config.py b/src/agentevals/storage/config.py new file mode 100644 index 0000000..62beb77 --- /dev/null +++ b/src/agentevals/storage/config.py @@ -0,0 +1,76 @@ +"""Storage configuration loaded from AGENTEVALS_* env vars.""" + +from __future__ import annotations + +import os +from typing import Literal + +from pydantic import BaseModel, Field, field_validator + +Backend = Literal["memory", "postgres"] + + +class StorageSettings(BaseModel): + """Runtime storage knobs. + + Read from environment in :meth:`from_env`. Defaults preserve the + pre-existing in-memory developer experience: no Postgres required, no + ``/api/runs`` endpoints registered. + """ + + backend: Backend = "memory" + database_url: str | None = None + schema_name: str = "agentevals" + migrate_lock_timeout_s: int = 60 + + max_concurrent_runs: int = Field(default=4, ge=1) + run_deadline_s: int = Field(default=300, ge=1) + heartbeat_s: int = Field(default=5, ge=1) + lease_s: int = Field(default=30, ge=1) + max_run_attempts: int = Field(default=3, ge=1) + worker_poll_interval_s: float = Field(default=1.0, gt=0) + + @field_validator("backend") + @classmethod + def _validate_backend(cls, v: Backend) -> Backend: + if v not in ("memory", "postgres"): + raise ValueError(f"unknown storage backend '{v}'; expected 'memory' or 'postgres'") + return v + + def model_post_init(self, __context: object) -> None: + if self.lease_s <= self.heartbeat_s: + raise ValueError( + f"AGENTEVALS_LEASE_S ({self.lease_s}) must be greater than AGENTEVALS_HEARTBEAT_S ({self.heartbeat_s})" + ) + if self.backend == "postgres" and not self.database_url: + raise ValueError("AGENTEVALS_STORAGE_BACKEND=postgres requires AGENTEVALS_DATABASE_URL") + + @classmethod + def from_env(cls) -> StorageSettings: + return cls( + backend=os.environ.get("AGENTEVALS_STORAGE_BACKEND", "memory"), + database_url=_read_dsn_from_env(), + schema_name=os.environ.get("AGENTEVALS_DATABASE_SCHEMA", "agentevals"), + migrate_lock_timeout_s=int(os.environ.get("AGENTEVALS_MIGRATE_LOCK_TIMEOUT", "60")), + max_concurrent_runs=int(os.environ.get("AGENTEVALS_MAX_CONCURRENT_RUNS", "4")), + run_deadline_s=int(os.environ.get("AGENTEVALS_RUN_DEADLINE_S", "300")), + heartbeat_s=int(os.environ.get("AGENTEVALS_HEARTBEAT_S", "5")), + lease_s=int(os.environ.get("AGENTEVALS_LEASE_S", "30")), + max_run_attempts=int(os.environ.get("AGENTEVALS_MAX_RUN_ATTEMPTS", "3")), + worker_poll_interval_s=float(os.environ.get("AGENTEVALS_WORKER_POLL_INTERVAL_S", "1.0")), + ) + + +def _read_dsn_from_env() -> str | None: + """Return the DSN with AGENTEVALS_DATABASE_URL_FILE preferred over the + inline AGENTEVALS_DATABASE_URL. 
The file path is intended for projected
+    workload-identity tokens or other secret rotators that prefer a file
+    surface to an env var."""
+    file_path = os.environ.get("AGENTEVALS_DATABASE_URL_FILE")
+    if file_path:
+        try:
+            with open(file_path) as f:
+                return f.read().strip() or None
+        except OSError as exc:
+            raise ValueError(f"AGENTEVALS_DATABASE_URL_FILE={file_path!r} is unreadable: {exc}") from exc
+    return os.environ.get("AGENTEVALS_DATABASE_URL")
diff --git a/src/agentevals/storage/models.py b/src/agentevals/storage/models.py
new file mode 100644
index 0000000..fa38636
--- /dev/null
+++ b/src/agentevals/storage/models.py
@@ -0,0 +1,169 @@
+"""Pydantic models for persisted Run and Result rows.
+
+These shapes are the durable, host-facing contract returned by ``/api/runs``
+and emitted via :class:`ResultSink`. They are deliberately distinct from the
+in-pipeline :class:`agentevals.runner.MetricResult` so renaming the persisted
+fields (``status``, ``error_text``, ``latency_ms``) does not break the existing
+``/api/evaluate`` SSE consumers.
+"""
+
+from __future__ import annotations
+
+import hashlib
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any, Literal
+from uuid import UUID
+
+from pydantic import BaseModel, ConfigDict, Field
+from pydantic.alias_generators import to_camel
+
+
+class RunStatus(str, Enum):
+    QUEUED = "queued"
+    RUNNING = "running"
+    SUCCEEDED = "succeeded"
+    FAILED = "failed"
+    CANCELLED = "cancelled"
+
+
+class ResultStatus(str, Enum):
+    PASSED = "passed"
+    FAILED = "failed"
+    ERRORED = "errored"
+    SKIPPED = "skipped"
+
+
+def _now() -> datetime:
+    return datetime.now(timezone.utc)
+
+
+def compute_result_id(run_id: UUID | str, eval_set_item_id: str, evaluator_name: str) -> str:
+    """Canonical SHA-256 of ``{run_id}|{eval_set_item_id}|{evaluator_name}``.
+
+    Deterministic so both retried webhook posts and retried executor runs
+    deduplicate cleanly via INSERT ... ON CONFLICT (result_id) DO UPDATE.
+    """
+    payload = f"{str(run_id).lower()}|{eval_set_item_id}|{evaluator_name}".encode()
+    return hashlib.sha256(payload).hexdigest()
+
+
+class TraceTarget(BaseModel):
+    """Where a run gets its trace from.
+
+    Discriminated by ``kind``:
+    - ``inline``: the OTLP/Jaeger JSON dict is embedded directly in the spec.
+    - ``http``: a TraceFetcher GETs from ``base_url + "/" + trace_id`` using
+      the run's ``context.headers``.
+    - ``uploaded``: synthesis-only kind written by ``/api/evaluate`` after a
+      synchronous UI/multipart upload completes. Records ``trace_count`` and
+      ``trace_files`` for audit but the trace bytes themselves are not
+      retained, so an ``uploaded`` run cannot be re-executed by the worker.
+    """
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True, extra="allow")
+
+    kind: Literal["inline", "http", "uploaded"]
+    inline: dict[str, Any] | None = None
+    base_url: str | None = None
+    trace_id: str | None = None
+    trace_format: Literal["jaeger-json", "otlp-json"] | None = None
+    trace_count: int | None = None
+    trace_files: list[str] | None = None
+
+
+class RunSpec(BaseModel):
+    """Validated submission body.
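+
+    A minimal accepted body (camelCase keys via the ``to_camel`` alias
+    generator; snake_case also works thanks to ``populate_by_name``)::
+
+        {"approach": "trace_replay",
+         "target": {"kind": "inline", "inline": {...}},
+         "evalConfig": {"metrics": [...]},
+         "sinks": [{"kind": "stdout"}]}
+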
Stored verbatim in ``agentevals.run.spec``.""" + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True, extra="allow") + + approach: Literal["trace_replay"] = "trace_replay" + target: TraceTarget + eval_set: dict[str, Any] | None = None + eval_config: dict[str, Any] = Field(default_factory=dict) + sinks: list[dict[str, Any]] = Field(default_factory=list) + context: dict[str, Any] = Field(default_factory=dict) + + +class Run(BaseModel): + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + run_id: UUID + status: RunStatus + spec: RunSpec + attempt: int = 0 + worker_id: str | None = None + error: str | None = None + summary: dict[str, Any] | None = None + created_at: datetime = Field(default_factory=_now) + started_at: datetime | None = None + finished_at: datetime | None = None + cancel_requested: bool = False + + +class Result(BaseModel): + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + result_id: str + run_id: UUID + eval_set_item_id: str + eval_set_item_name: str + evaluator_name: str + evaluator_type: Literal["builtin", "code", "remote", "openai_eval"] + status: ResultStatus + score: float | None = None + per_invocation_scores: list[float | None] = Field(default_factory=list) + trace_id: str | None = None + span_id: str | None = None + details: dict[str, Any] = Field(default_factory=dict) + error_text: str | None = None + tokens_used: dict[str, Any] | None = None + latency_ms: int | None = None + created_at: datetime = Field(default_factory=_now) + + @classmethod + def from_metric_result( + cls, + *, + run_id: UUID, + eval_set_item_id: str, + eval_set_item_name: str, + trace_id: str | None, + evaluator_type: Literal["builtin", "code", "remote", "openai_eval"], + metric_result: Any, + ) -> Result: + """Project an in-pipeline MetricResult onto the persisted shape. + + ADK emits ``eval_status`` strings ``PASSED`` / ``FAILED`` / + ``NOT_EVALUATED``; we additionally map presence of ``error`` to + ``errored`` so downstream consumers don't have to special-case + evaluator failures. + """ + if metric_result.error: + status = ResultStatus.ERRORED + else: + raw = (metric_result.eval_status or "NOT_EVALUATED").upper() + status = { + "PASSED": ResultStatus.PASSED, + "FAILED": ResultStatus.FAILED, + }.get(raw, ResultStatus.SKIPPED) + + scores: list[float | None] = list(metric_result.per_invocation_scores or []) + latency_ms = int(metric_result.duration_ms) if metric_result.duration_ms is not None else None + + return cls( + result_id=compute_result_id(run_id, eval_set_item_id, metric_result.metric_name), + run_id=run_id, + eval_set_item_id=eval_set_item_id, + eval_set_item_name=eval_set_item_name, + evaluator_name=metric_result.metric_name, + evaluator_type=evaluator_type, + status=status, + score=metric_result.score, + per_invocation_scores=scores, + trace_id=trace_id, + details=metric_result.details or {}, + error_text=metric_result.error, + latency_ms=latency_ms, + ) diff --git a/src/agentevals/storage/postgres/__init__.py b/src/agentevals/storage/postgres/__init__.py new file mode 100644 index 0000000..3dc47d8 --- /dev/null +++ b/src/agentevals/storage/postgres/__init__.py @@ -0,0 +1,5 @@ +"""Postgres backend (asyncpg, no ORM). + +Hand-written SQL because we lean on PG-specific features (FOR UPDATE SKIP +LOCKED, pg_try_advisory_lock, JSONB, ARRAY) that an ORM would obscure. 
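+
+The load-bearing example is the claim query in ``repos/postgres.py``
+(abridged; the real query also gates on attempts and cancellation)::
+
+    SELECT run_id FROM {schema}.run
+    WHERE status = 'queued'
+       OR (status = 'running' AND lease_expires_at < now())
+    ORDER BY created_at
+    LIMIT 1
+    FOR UPDATE SKIP LOCKED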
+""" diff --git a/src/agentevals/storage/postgres/migrations/000001_init.down.sql b/src/agentevals/storage/postgres/migrations/000001_init.down.sql new file mode 100644 index 0000000..131b385 --- /dev/null +++ b/src/agentevals/storage/postgres/migrations/000001_init.down.sql @@ -0,0 +1,5 @@ +-- WARNING: dropping the schema deletes ALL agentevals data: sessions, runs, +-- results, and the evaluator cache. This file is invoked only by +-- ``agentevals migrate down --steps N`` and is not safe to run in production. + +DROP SCHEMA IF EXISTS {schema} CASCADE; diff --git a/src/agentevals/storage/postgres/migrations/000001_init.up.sql b/src/agentevals/storage/postgres/migrations/000001_init.up.sql new file mode 100644 index 0000000..98bcaba --- /dev/null +++ b/src/agentevals/storage/postgres/migrations/000001_init.up.sql @@ -0,0 +1,110 @@ +-- agentevals baseline schema. Immutable once tagged in a release. +-- Schema changes go in a NEW migration file (000002_*.up.sql, etc.). +-- The {schema} placeholder is substituted by the Python migrator at apply time. + +CREATE SCHEMA IF NOT EXISTS {schema}; + +CREATE TABLE IF NOT EXISTS {schema}.session ( + session_id TEXT PRIMARY KEY, + trace_id TEXT NOT NULL, + trace_ids TEXT[] NOT NULL DEFAULT '{{}}', + eval_set_id TEXT, + source TEXT NOT NULL CHECK (source IN ('websocket', 'otlp', 'api')), + is_complete BOOLEAN NOT NULL DEFAULT FALSE, + has_root_span BOOLEAN NOT NULL DEFAULT FALSE, + metadata JSONB NOT NULL DEFAULT '{{}}', + started_at TIMESTAMPTZ NOT NULL DEFAULT now(), + completed_at TIMESTAMPTZ, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now(), + expires_at TIMESTAMPTZ +); + +CREATE INDEX IF NOT EXISTS session_expires_at_idx + ON {schema}.session (expires_at) + WHERE expires_at IS NOT NULL; + +-- Reserved for future per-span / per-log persistence. Spans and logs stay +-- in-process on StreamingTraceManager in this OSS slice; this table exists +-- so a future migration can populate it without an ALTER on session. +CREATE TABLE IF NOT EXISTS {schema}.session_event ( + event_id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, + session_id TEXT NOT NULL REFERENCES {schema}.session(session_id) ON DELETE CASCADE, + kind TEXT NOT NULL CHECK (kind IN ('span', 'log')), + payload JSONB NOT NULL, + received_at TIMESTAMPTZ NOT NULL DEFAULT now() +); + +CREATE INDEX IF NOT EXISTS session_event_session_id_idx + ON {schema}.session_event (session_id, event_id); + +-- Run state and work queue. claim_next() relies on the run_queue_idx for +-- SELECT FOR UPDATE SKIP LOCKED ordering. 
+CREATE TABLE IF NOT EXISTS {schema}.run ( + run_id UUID PRIMARY KEY, + status TEXT NOT NULL CHECK (status IN + ('queued', 'running', 'succeeded', 'failed', 'cancelled')), + approach TEXT NOT NULL CHECK (approach IN ('trace_replay', 'agent_invoke')), + spec JSONB NOT NULL, + attempt INT NOT NULL DEFAULT 0, + worker_id TEXT, + claimed_at TIMESTAMPTZ, + lease_expires_at TIMESTAMPTZ, + cancel_requested BOOLEAN NOT NULL DEFAULT FALSE, + error TEXT, + summary JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + started_at TIMESTAMPTZ, + finished_at TIMESTAMPTZ, + CONSTRAINT run_running_has_worker + CHECK (status <> 'running' + OR (worker_id IS NOT NULL + AND claimed_at IS NOT NULL + AND lease_expires_at IS NOT NULL)) +); + +CREATE INDEX IF NOT EXISTS run_queue_idx + ON {schema}.run (status, created_at) + WHERE status IN ('queued', 'running'); + +CREATE INDEX IF NOT EXISTS run_lease_idx + ON {schema}.run (lease_expires_at) + WHERE status = 'running'; + +CREATE TABLE IF NOT EXISTS {schema}.result ( + result_id TEXT PRIMARY KEY, + run_id UUID NOT NULL REFERENCES {schema}.run(run_id) ON DELETE CASCADE, + eval_set_item_id TEXT NOT NULL, + eval_set_item_name TEXT NOT NULL, + evaluator_name TEXT NOT NULL, + evaluator_type TEXT NOT NULL CHECK (evaluator_type IN + ('builtin', 'code', 'remote', 'openai_eval')), + status TEXT NOT NULL CHECK (status IN + ('passed', 'failed', 'errored', 'skipped')), + score DOUBLE PRECISION, + per_invocation_scores DOUBLE PRECISION[] NOT NULL DEFAULT '{{}}', + trace_id TEXT, + span_id TEXT, + details JSONB NOT NULL DEFAULT '{{}}', + error_text TEXT, + tokens_used JSONB, + latency_ms INT, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + expires_at TIMESTAMPTZ +); + +CREATE INDEX IF NOT EXISTS result_run_id_idx ON {schema}.result (run_id); +CREATE INDEX IF NOT EXISTS result_expires_at_idx ON {schema}.result (expires_at) WHERE expires_at IS NOT NULL; + +-- Reserved for cached evaluator code from external sources (GitHub today, +-- additional sources later). No read/write code in this slice; included here +-- so a future change does not require an ALTER on this table. +CREATE TABLE IF NOT EXISTS {schema}.evaluator_cache ( + source_name TEXT NOT NULL, + evaluator_name TEXT NOT NULL, + ref TEXT NOT NULL, + content BYTEA NOT NULL, + metadata JSONB NOT NULL DEFAULT '{{}}', + fetched_at TIMESTAMPTZ NOT NULL DEFAULT now(), + PRIMARY KEY (source_name, evaluator_name, ref) +); diff --git a/src/agentevals/storage/postgres/migrator.py b/src/agentevals/storage/postgres/migrator.py new file mode 100644 index 0000000..13c207b --- /dev/null +++ b/src/agentevals/storage/postgres/migrator.py @@ -0,0 +1,286 @@ +"""SQL migration runner. + +Applies sequentially numbered migrations under +``src/agentevals/storage/postgres/migrations/``. Holds a Postgres advisory +lock for the duration so multi-replica installs can safely call ``migrate +up`` from any process. The tracking table is golang-migrate compatible +(``schema_migrations`` with ``version`` BIGINT PRIMARY KEY and ``dirty`` +BOOLEAN), so external migration tooling can adopt the same files later +without translation. +""" + +from __future__ import annotations + +import asyncio +import logging +import re +from dataclasses import dataclass +from importlib.resources import files +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import asyncpg + +logger = logging.getLogger(__name__) + +ADVISORY_LOCK_KEY = 7259820376655812345 +"""Fixed int8 used by pg_try_advisory_lock during migration runs. 
Chosen at random; collision-free for any sane application."""

+_FILE_PATTERN = re.compile(r"^(?P<version>\d{6})_(?P<name>[a-z0-9_]+)\.(?P<dir>up|down)\.sql$")
+
+
+@dataclass(frozen=True)
+class Migration:
+    version: int
+    name: str
+    up_sql: str
+    down_sql: str | None
+
+
+def _discover_migrations() -> list[Migration]:
+    """Read all NNNNNN_name.up.sql / .down.sql pairs from the package.
+
+    importlib.resources resolves correctly inside a wheel, in editable
+    installs, and from a zipped package.
+    """
+    pkg = files("agentevals.storage.postgres.migrations")
+    ups: dict[int, tuple[str, str]] = {}
+    downs: dict[int, str] = {}
+
+    for entry in pkg.iterdir():
+        match = _FILE_PATTERN.match(entry.name)
+        if not match:
+            continue
+        version = int(match.group("version"))
+        name = match.group("name")
+        sql = entry.read_text(encoding="utf-8")
+        if match.group("dir") == "up":
+            ups[version] = (name, sql)
+        else:
+            downs[version] = sql
+
+    migrations = []
+    for version in sorted(ups):
+        name, up_sql = ups[version]
+        migrations.append(Migration(version=version, name=name, up_sql=up_sql, down_sql=downs.get(version)))
+    return migrations
+
+
+def _apply_schema(sql: str, schema: str) -> str:
+    """Substitute the {schema} placeholder. Doubled braces in SQL literals
+    (``'{{}}'``) collapse back to single braces."""
+    if not re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", schema):
+        raise ValueError(f"invalid schema name '{schema}'; must be a SQL identifier")
+    return sql.replace("{schema}", schema).replace("{{}}", "{}")
+
+
+@dataclass
+class MigrationStatus:
+    version: int | None
+    dirty: bool
+
+
+class Migrator:
+    """Applies and rolls back migrations against a single Postgres database.
+
+    One advisory lock is held for the lifetime of any apply/rollback call so
+    concurrent migrators (multiple agentevals replicas booting at once) wait
+    rather than racing.
+    """
+
+    def __init__(self, dsn: str, schema: str = "agentevals", lock_timeout_s: int = 60) -> None:
+        self._dsn = dsn
+        self._schema = schema
+        self._lock_timeout_s = lock_timeout_s
+
+    async def _connect(self) -> "asyncpg.Connection":
+        try:
+            import asyncpg
+        except ImportError as exc:
+            raise ImportError(
+                "agentevals migrate requires the 'postgres' extra. Install with: uv sync --extra postgres"
+            ) from exc
+        return await connect_with_retry(self._dsn, asyncpg)
+
+    async def _acquire_lock(self, conn: "asyncpg.Connection") -> None:
+        deadline = asyncio.get_event_loop().time() + self._lock_timeout_s
+        attempt = 0
+        while True:
+            acquired = await conn.fetchval("SELECT pg_try_advisory_lock($1)", ADVISORY_LOCK_KEY)
+            if acquired:
+                return
+            if asyncio.get_event_loop().time() >= deadline:
+                raise TimeoutError(
+                    f"Could not acquire migration advisory lock within {self._lock_timeout_s}s. "
+                    "Another migration is likely in progress."
+ ) + attempt += 1 + wait = min(2.0, 0.2 * attempt) + logger.info("Waiting for migration lock (attempt %d, sleeping %.1fs)...", attempt, wait) + await asyncio.sleep(wait) + + async def _release_lock(self, conn: "asyncpg.Connection") -> None: + await conn.execute("SELECT pg_advisory_unlock($1)", ADVISORY_LOCK_KEY) + + async def _ensure_tracking_table(self, conn: "asyncpg.Connection") -> None: + await conn.execute(f'CREATE SCHEMA IF NOT EXISTS "{self._schema}"') + await conn.execute( + f'CREATE TABLE IF NOT EXISTS "{self._schema}".schema_migrations ' + "(version BIGINT NOT NULL PRIMARY KEY, dirty BOOLEAN NOT NULL)" + ) + + async def _read_status(self, conn: "asyncpg.Connection") -> MigrationStatus: + row = await conn.fetchrow(f'SELECT version, dirty FROM "{self._schema}".schema_migrations LIMIT 1') + if row is None: + return MigrationStatus(version=None, dirty=False) + return MigrationStatus(version=int(row["version"]), dirty=bool(row["dirty"])) + + async def _write_status(self, conn: "asyncpg.Connection", version: int | None, dirty: bool) -> None: + await conn.execute(f'DELETE FROM "{self._schema}".schema_migrations') + if version is not None: + await conn.execute( + f'INSERT INTO "{self._schema}".schema_migrations (version, dirty) VALUES ($1, $2)', + version, + dirty, + ) + + async def status(self) -> MigrationStatus: + conn = await self._connect() + try: + await self._ensure_tracking_table(conn) + return await self._read_status(conn) + finally: + await conn.close() + + async def up(self, *, dry_run: bool = False) -> list[int]: + migrations = _discover_migrations() + applied: list[int] = [] + conn = await self._connect() + try: + await self._ensure_tracking_table(conn) + await self._acquire_lock(conn) + try: + status = await self._read_status(conn) + if status.dirty: + raise RuntimeError( + f"schema_migrations is dirty at version {status.version}. 
" + "Resolve manually, then run: agentevals migrate force " + ) + pending = [m for m in migrations if status.version is None or m.version > status.version] + if not pending: + logger.info("Nothing to apply (current version: %s)", status.version) + return [] + for m in pending: + sql = _apply_schema(m.up_sql, self._schema) + if dry_run: + logger.info("Would apply migration %06d_%s", m.version, m.name) + applied.append(m.version) + continue + logger.info("Applying migration %06d_%s", m.version, m.name) + await self._write_status(conn, m.version, dirty=True) + try: + async with conn.transaction(): + await conn.execute(sql) + await self._write_status(conn, m.version, dirty=False) + except Exception: + logger.exception("Migration %06d_%s failed; schema_migrations left dirty", m.version, m.name) + raise + applied.append(m.version) + finally: + await self._release_lock(conn) + finally: + await conn.close() + return applied + + async def down(self, *, steps: int) -> list[tuple[int, str]]: + if steps < 1: + raise ValueError("steps must be >= 1") + migrations = _discover_migrations() + by_version = {m.version: m for m in migrations} + rolled_back: list[tuple[int, str]] = [] + conn = await self._connect() + try: + await self._ensure_tracking_table(conn) + await self._acquire_lock(conn) + try: + status = await self._read_status(conn) + if status.dirty or status.version is None: + raise RuntimeError( + f"refusing to roll back from dirty/empty state (version={status.version}, dirty={status.dirty})" + ) + applied_versions = sorted((v for v in by_version if v <= status.version), reverse=True) + target_versions = applied_versions[:steps] + for version in target_versions: + m = by_version[version] + if not m.down_sql: + raise RuntimeError(f"migration {version:06d}_{m.name} has no down.sql") + sql = _apply_schema(m.down_sql, self._schema) + logger.warning("Rolling back %06d_%s\n--- SQL ---\n%s\n--- end ---", m.version, m.name, sql) + next_version = max((v for v in by_version if v < version), default=None) + await self._write_status(conn, version, dirty=True) + try: + async with conn.transaction(): + await conn.execute(sql) + await self._write_status(conn, next_version, dirty=False) + except Exception: + logger.exception( + "Down migration %06d_%s failed; schema_migrations left dirty", m.version, m.name + ) + raise + rolled_back.append((m.version, m.name)) + if next_version is None: + break + finally: + await self._release_lock(conn) + finally: + await conn.close() + return rolled_back + + async def force(self, version: int) -> None: + conn = await self._connect() + try: + await self._ensure_tracking_table(conn) + await self._write_status(conn, version, dirty=False) + finally: + await conn.close() + + +def discover_migrations() -> list[Migration]: + """Public alias for the migration discovery helper, used by ``migrate create``.""" + return _discover_migrations() + + +CONNECT_RETRY_DEADLINE_S = 60.0 +"""Total wall-clock budget for the initial Postgres connection. Bundled PG +in Kubernetes typically takes 5-15s to be ready (PVC bind, initdb, listener +bind), so the agentevals lifespan can race the database on a fresh deploy. +Retrying tolerates that gap rather than failing pod startup and relying on +CrashLoopBackOff timing to eventually line up.""" + + +async def connect_with_retry(dsn: str, asyncpg_module) -> "asyncpg.Connection": + """Open a single asyncpg connection, retrying on connection-refused or + server-not-ready errors for up to ``CONNECT_RETRY_DEADLINE_S`` seconds. 
+ + Connection-time errors are tolerated; once a connection has been + established and a query returned, all subsequent failures propagate + normally. + """ + deadline = asyncio.get_event_loop().time() + CONNECT_RETRY_DEADLINE_S + delay = 0.5 + while True: + try: + return await asyncpg_module.connect(dsn) + except (OSError, asyncpg_module.PostgresError) as exc: + now = asyncio.get_event_loop().time() + if now >= deadline: + raise + sleep_for = min(delay, deadline - now) + logger.info( + "Database not ready (%s); retrying in %.1fs", + type(exc).__name__, + sleep_for, + ) + await asyncio.sleep(sleep_for) + delay = min(delay * 2, 5.0) diff --git a/src/agentevals/storage/postgres/pool.py b/src/agentevals/storage/postgres/pool.py new file mode 100644 index 0000000..f1446e0 --- /dev/null +++ b/src/agentevals/storage/postgres/pool.py @@ -0,0 +1,80 @@ +"""asyncpg pool factory. + +asyncpg is imported lazily so the base ``agentevals`` install (without the +``[postgres]`` extra) does not require it. +""" + +from __future__ import annotations + +import asyncio +import logging +from typing import TYPE_CHECKING + +from ..config import StorageSettings + +if TYPE_CHECKING: + import asyncpg + +logger = logging.getLogger(__name__) + + +async def create_pool(settings: StorageSettings) -> "asyncpg.Pool": + """Build an asyncpg pool sized for the worker fan-out plus headroom. + + The pool needs at least one connection per concurrent worker (claim + + heartbeat run on the same connection), one for the API request handlers, + plus a small buffer. + + Pool warmup eagerly opens ``min_size`` connections, which can race with + Postgres readiness on a fresh deploy. We retry on connection-refused so + the lifespan tolerates the gap rather than crashing the pod. + """ + try: + import asyncpg + except ImportError as exc: + raise ImportError( + "AGENTEVALS_STORAGE_BACKEND=postgres requires the 'postgres' extra. 
" + "Install with: uv sync --extra postgres (or pip install 'agentevals-cli[postgres]')" + ) from exc + + if not settings.database_url: + raise ValueError("AGENTEVALS_DATABASE_URL is required for postgres backend") + + min_size = max(2, settings.max_concurrent_runs) + max_size = settings.max_concurrent_runs * 2 + 4 + + logger.info( + "Creating asyncpg pool (min=%d, max=%d) for schema '%s'", + min_size, + max_size, + settings.schema_name, + ) + + from .migrator import CONNECT_RETRY_DEADLINE_S + + deadline = asyncio.get_event_loop().time() + CONNECT_RETRY_DEADLINE_S + delay = 0.5 + while True: + try: + pool = await asyncpg.create_pool( + dsn=settings.database_url, + min_size=min_size, + max_size=max_size, + command_timeout=60, + ) + break + except (OSError, asyncpg.PostgresError) as exc: + now = asyncio.get_event_loop().time() + if now >= deadline: + raise + sleep_for = min(delay, deadline - now) + logger.info( + "Pool warmup failed (%s); retrying in %.1fs", + type(exc).__name__, + sleep_for, + ) + await asyncio.sleep(sleep_for) + delay = min(delay * 2, 5.0) + if pool is None: + raise RuntimeError("asyncpg.create_pool returned None") + return pool diff --git a/src/agentevals/storage/repos/__init__.py b/src/agentevals/storage/repos/__init__.py new file mode 100644 index 0000000..c267be2 --- /dev/null +++ b/src/agentevals/storage/repos/__init__.py @@ -0,0 +1,90 @@ +"""Repository protocols and the bundle holder used by ``/api/runs``.""" + +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime, timedelta +from typing import Protocol +from uuid import UUID + +from ..models import Result, Run, RunStatus + +if False: # for type checking only — avoids circular import at runtime + from ...streaming.session import TraceSession + + +class SessionRepository(Protocol): + """Tracks streaming TraceSession metadata. + + Spans and logs themselves stay in-process on the StreamingTraceManager in + this OSS slice; only the session lifecycle row is persisted. + """ + + async def get(self, session_id: str) -> "TraceSession | None": ... + async def upsert(self, session: "TraceSession") -> None: ... + async def delete(self, session_id: str) -> None: ... + async def list_all(self) -> "list[TraceSession]": ... + async def find_by_trace_id(self, trace_id: str) -> "TraceSession | None": ... + + +class RunRepository(Protocol): + async def create(self, run: Run) -> Run: + """Insert a new run. Idempotent on ``run_id`` — if a row exists with + the same id, returns the persisted row unchanged. + """ + + async def get(self, run_id: UUID) -> Run | None: ... + async def list( + self, + *, + status: list[RunStatus] | None = None, + limit: int = 100, + before: datetime | None = None, + ) -> list[Run]: ... + async def claim_next(self, *, worker_id: str, lease: timedelta, max_attempts: int) -> Run | None: + """Atomically claim a queued or lease-expired run via SELECT FOR UPDATE + SKIP LOCKED. Returns ``None`` if no work is available. + """ + + async def heartbeat(self, run_id: UUID, worker_id: str, lease: timedelta) -> bool: + """Extend the lease. Returns False if the run was cancelled or lost.""" + + async def update_status( + self, + run_id: UUID, + status: RunStatus, + *, + error: str | None = None, + summary: dict | None = None, + ) -> None: ... 
+ async def cancel(self, run_id: UUID) -> bool: + """Mark cancel_requested=True; the worker observes on next heartbeat.""" + + +class ResultRepository(Protocol): + async def upsert_many(self, run_id: UUID, results: list[Result]) -> None: + """Idempotent bulk insert/update on ``result_id``.""" + + async def list_by_run(self, run_id: UUID) -> list[Result]: ... + async def delete_by_run(self, run_id: UUID) -> None: ... + + +@dataclass +class Repos: + """Bundle of the three repos plus a close hook for the underlying pool.""" + + sessions: SessionRepository + runs: RunRepository + results: ResultRepository + backend: str + + async def close(self) -> None: + pass + + +__all__ = [ + "Repos", + "ResultRepository", + "RunRepository", + "SessionRepository", +] diff --git a/src/agentevals/storage/repos/memory.py b/src/agentevals/storage/repos/memory.py new file mode 100644 index 0000000..2e9790e --- /dev/null +++ b/src/agentevals/storage/repos/memory.py @@ -0,0 +1,183 @@ +"""In-process dict-backed implementations of the repository protocols. + +Used as the default for OSS so ``agentevals run trace.json`` and ``helm +install agentevals`` keep working with no external dependencies. Behavior +matches the pre-existing :class:`StreamingTraceManager.sessions` dict that +this code replaces. +""" + +from __future__ import annotations + +import asyncio +from datetime import datetime, timedelta, timezone +from typing import TYPE_CHECKING +from uuid import UUID + +from ..models import Result, Run, RunStatus +from . import Repos, ResultRepository, RunRepository, SessionRepository + +if TYPE_CHECKING: + from ...streaming.session import TraceSession + + +def _now() -> datetime: + return datetime.now(timezone.utc) + + +class MemorySessionRepository: + def __init__(self) -> None: + self._sessions: dict[str, TraceSession] = {} + self._lock = asyncio.Lock() + + async def get(self, session_id: str) -> TraceSession | None: + async with self._lock: + return self._sessions.get(session_id) + + async def upsert(self, session: TraceSession) -> None: + async with self._lock: + self._sessions[session.session_id] = session + + async def delete(self, session_id: str) -> None: + async with self._lock: + self._sessions.pop(session_id, None) + + async def list_all(self) -> list[TraceSession]: + async with self._lock: + return list(self._sessions.values()) + + async def find_by_trace_id(self, trace_id: str) -> TraceSession | None: + async with self._lock: + for session in self._sessions.values(): + if trace_id in session.trace_ids: + return session + return None + + +class MemoryRunRepository: + def __init__(self) -> None: + self._runs: dict[UUID, Run] = {} + self._lock = asyncio.Lock() + + async def create(self, run: Run) -> Run: + async with self._lock: + existing = self._runs.get(run.run_id) + if existing is not None: + return existing + self._runs[run.run_id] = run + return run + + async def get(self, run_id: UUID) -> Run | None: + async with self._lock: + return self._runs.get(run_id) + + async def list( + self, + *, + status: list[RunStatus] | None = None, + limit: int = 100, + before: datetime | None = None, + ) -> list[Run]: + async with self._lock: + runs = list(self._runs.values()) + runs.sort(key=lambda r: r.created_at, reverse=True) + if status: + runs = [r for r in runs if r.status in status] + if before: + runs = [r for r in runs if r.created_at < before] + return runs[:limit] + + async def claim_next(self, *, worker_id: str, lease: timedelta, max_attempts: int) -> Run | None: + now = _now() + async with self._lock: + 
candidates = [r for r in self._runs.values() if r.status == RunStatus.QUEUED and r.attempt < max_attempts] + candidates.sort(key=lambda r: r.created_at) + if not candidates: + return None + run = candidates[0] + run.status = RunStatus.RUNNING + run.worker_id = worker_id + run.attempt += 1 + run.started_at = run.started_at or now + return run + + async def heartbeat(self, run_id: UUID, worker_id: str, lease: timedelta) -> bool: + async with self._lock: + run = self._runs.get(run_id) + if run is None or run.worker_id != worker_id: + return False + return not run.cancel_requested + + async def update_status( + self, + run_id: UUID, + status: RunStatus, + *, + error: str | None = None, + summary: dict | None = None, + ) -> None: + async with self._lock: + run = self._runs.get(run_id) + if run is None: + return + run.status = status + if error is not None: + run.error = error + if summary is not None: + run.summary = summary + if status in (RunStatus.SUCCEEDED, RunStatus.FAILED, RunStatus.CANCELLED): + run.finished_at = _now() + + async def cancel(self, run_id: UUID) -> bool: + async with self._lock: + run = self._runs.get(run_id) + if run is None or run.status in (RunStatus.SUCCEEDED, RunStatus.FAILED, RunStatus.CANCELLED): + return False + run.cancel_requested = True + if run.status == RunStatus.QUEUED: + run.status = RunStatus.CANCELLED + run.finished_at = _now() + return True + + +class MemoryResultRepository: + def __init__(self) -> None: + self._results: dict[str, Result] = {} + self._by_run: dict[UUID, list[str]] = {} + self._lock = asyncio.Lock() + + async def upsert_many(self, run_id: UUID, results: list[Result]) -> None: + async with self._lock: + for r in results: + self._results[r.result_id] = r + ids = self._by_run.setdefault(run_id, []) + if r.result_id not in ids: + ids.append(r.result_id) + + async def list_by_run(self, run_id: UUID) -> list[Result]: + async with self._lock: + ids = self._by_run.get(run_id, []) + return [self._results[i] for i in ids if i in self._results] + + async def delete_by_run(self, run_id: UUID) -> None: + async with self._lock: + for rid in self._by_run.pop(run_id, []): + self._results.pop(rid, None) + + +class MemoryRepos(Repos): + @classmethod + def create(cls) -> "MemoryRepos": + return cls( + sessions=MemorySessionRepository(), + runs=MemoryRunRepository(), + results=MemoryResultRepository(), + backend="memory", + ) + + +__all__ = [ + "MemoryRepos", + "MemoryResultRepository", + "MemoryRunRepository", + "MemorySessionRepository", +] diff --git a/src/agentevals/storage/repos/postgres.py b/src/agentevals/storage/repos/postgres.py new file mode 100644 index 0000000..dad4ebb --- /dev/null +++ b/src/agentevals/storage/repos/postgres.py @@ -0,0 +1,389 @@ +"""asyncpg-backed repository implementations. + +Plain SQL, no ORM. The connection pool is created in +``storage.postgres.pool.create_pool`` and lives on :class:`PostgresRepos`; +each method acquires a connection from the pool for the duration of a single +query or transaction. +""" + +from __future__ import annotations + +import json +import logging +from datetime import datetime, timedelta, timezone +from typing import TYPE_CHECKING +from uuid import UUID + +from ..models import Result, ResultStatus, Run, RunSpec, RunStatus +from . 
import Repos + +if TYPE_CHECKING: + import asyncpg + + from ...streaming.session import TraceSession + +logger = logging.getLogger(__name__) + + +def _now() -> datetime: + return datetime.now(timezone.utc) + + +def _row_to_session(row: "asyncpg.Record") -> "TraceSession": + from ...streaming.session import TraceSession + + return TraceSession( + session_id=row["session_id"], + trace_id=row["trace_id"], + eval_set_id=row["eval_set_id"], + started_at=row["started_at"], + is_complete=row["is_complete"], + completed_at=row["completed_at"], + metadata=dict(row["metadata"]) if row["metadata"] else {}, + source=row["source"], + has_root_span=row["has_root_span"], + trace_ids=set(row["trace_ids"] or []), + ) + + +def _row_to_run(row: "asyncpg.Record") -> Run: + spec_json = row["spec"] + spec_dict = json.loads(spec_json) if isinstance(spec_json, str) else spec_json + summary_json = row["summary"] + summary = json.loads(summary_json) if isinstance(summary_json, str) else summary_json + return Run( + run_id=row["run_id"], + status=RunStatus(row["status"]), + spec=RunSpec.model_validate(spec_dict), + attempt=row["attempt"], + worker_id=row["worker_id"], + error=row["error"], + summary=summary, + created_at=row["created_at"], + started_at=row["started_at"], + finished_at=row["finished_at"], + cancel_requested=row["cancel_requested"], + ) + + +def _row_to_result(row: "asyncpg.Record") -> Result: + details_json = row["details"] + details = json.loads(details_json) if isinstance(details_json, str) else details_json + tokens_json = row["tokens_used"] + tokens = json.loads(tokens_json) if isinstance(tokens_json, str) else tokens_json + return Result( + result_id=row["result_id"], + run_id=row["run_id"], + eval_set_item_id=row["eval_set_item_id"], + eval_set_item_name=row["eval_set_item_name"], + evaluator_name=row["evaluator_name"], + evaluator_type=row["evaluator_type"], + status=ResultStatus(row["status"]), + score=row["score"], + per_invocation_scores=list(row["per_invocation_scores"] or []), + trace_id=row["trace_id"], + span_id=row["span_id"], + details=dict(details) if details else {}, + error_text=row["error_text"], + tokens_used=dict(tokens) if tokens else None, + latency_ms=row["latency_ms"], + created_at=row["created_at"], + ) + + +class PostgresSessionRepository: + def __init__(self, pool: "asyncpg.Pool", schema: str) -> None: + self._pool = pool + self._schema = schema + + @property + def _t(self) -> str: + return f'"{self._schema}".session' + + async def get(self, session_id: str) -> "TraceSession | None": + row = await self._pool.fetchrow(f"SELECT * FROM {self._t} WHERE session_id = $1", session_id) + return _row_to_session(row) if row else None + + async def upsert(self, session: "TraceSession") -> None: + await self._pool.execute( + f""" + INSERT INTO {self._t} + (session_id, trace_id, trace_ids, eval_set_id, source, is_complete, + has_root_span, metadata, started_at, completed_at, updated_at) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8::jsonb, $9, $10, now()) + ON CONFLICT (session_id) DO UPDATE SET + trace_id = EXCLUDED.trace_id, + trace_ids = EXCLUDED.trace_ids, + eval_set_id = EXCLUDED.eval_set_id, + source = EXCLUDED.source, + is_complete = EXCLUDED.is_complete, + has_root_span= EXCLUDED.has_root_span, + metadata = EXCLUDED.metadata, + started_at = EXCLUDED.started_at, + completed_at = EXCLUDED.completed_at, + updated_at = now() + """, + session.session_id, + session.trace_id, + sorted(session.trace_ids), + session.eval_set_id, + session.source, + session.is_complete, + 
session.has_root_span, + json.dumps(session.metadata or {}), + session.started_at, + session.completed_at, + ) + + async def delete(self, session_id: str) -> None: + await self._pool.execute(f"DELETE FROM {self._t} WHERE session_id = $1", session_id) + + async def list_all(self) -> "list[TraceSession]": + rows = await self._pool.fetch(f"SELECT * FROM {self._t} ORDER BY started_at DESC") + return [_row_to_session(r) for r in rows] + + async def find_by_trace_id(self, trace_id: str) -> "TraceSession | None": + row = await self._pool.fetchrow( + f"SELECT * FROM {self._t} WHERE $1 = ANY(trace_ids) OR trace_id = $1 ORDER BY started_at DESC LIMIT 1", + trace_id, + ) + return _row_to_session(row) if row else None + + +class PostgresRunRepository: + def __init__(self, pool: "asyncpg.Pool", schema: str) -> None: + self._pool = pool + self._schema = schema + + @property + def _t(self) -> str: + return f'"{self._schema}".run' + + async def create(self, run: Run) -> Run: + spec_json = run.spec.model_dump_json(by_alias=False) + row = await self._pool.fetchrow( + f""" + INSERT INTO {self._t} + (run_id, status, approach, spec, attempt, created_at) + VALUES ($1, $2, $3, $4::jsonb, 0, $5) + ON CONFLICT (run_id) DO NOTHING + RETURNING * + """, + run.run_id, + run.status.value, + run.spec.approach, + spec_json, + run.created_at, + ) + if row is not None: + return _row_to_run(row) + existing = await self.get(run.run_id) + if existing is None: + raise RuntimeError(f"run {run.run_id} disappeared between INSERT ... ON CONFLICT and SELECT") + return existing + + async def get(self, run_id: UUID) -> Run | None: + row = await self._pool.fetchrow(f"SELECT * FROM {self._t} WHERE run_id = $1", run_id) + return _row_to_run(row) if row else None + + async def list( + self, + *, + status: list[RunStatus] | None = None, + limit: int = 100, + before: datetime | None = None, + ) -> list[Run]: + clauses: list[str] = [] + args: list[object] = [] + if status: + args.append([s.value for s in status]) + clauses.append(f"status = ANY(${len(args)})") + if before: + args.append(before) + clauses.append(f"created_at < ${len(args)}") + where = ("WHERE " + " AND ".join(clauses)) if clauses else "" + args.append(limit) + rows = await self._pool.fetch( + f"SELECT * FROM {self._t} {where} ORDER BY created_at DESC LIMIT ${len(args)}", + *args, + ) + return [_row_to_run(r) for r in rows] + + async def claim_next(self, *, worker_id: str, lease: timedelta, max_attempts: int) -> Run | None: + lease_seconds = int(lease.total_seconds()) + async with self._pool.acquire() as conn: + async with conn.transaction(): + row = await conn.fetchrow( + f""" + UPDATE {self._t} + SET status = 'running', + worker_id = $1, + claimed_at = now(), + lease_expires_at = now() + make_interval(secs => $2), + started_at = COALESCE(started_at, now()), + attempt = attempt + 1 + WHERE run_id = ( + SELECT run_id FROM {self._t} + WHERE attempt < $3 + AND cancel_requested = FALSE + AND (status = 'queued' + OR (status = 'running' AND lease_expires_at < now())) + ORDER BY created_at + LIMIT 1 + FOR UPDATE SKIP LOCKED + ) + RETURNING * + """, + worker_id, + lease_seconds, + max_attempts, + ) + return _row_to_run(row) if row else None + + async def heartbeat(self, run_id: UUID, worker_id: str, lease: timedelta) -> bool: + lease_seconds = int(lease.total_seconds()) + row = await self._pool.fetchrow( + f""" + UPDATE {self._t} + SET lease_expires_at = now() + make_interval(secs => $1) + WHERE run_id = $2 + AND worker_id = $3 + AND status = 'running' + AND cancel_requested = FALSE 
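+                  -- cancel_requested=TRUE makes this UPDATE match zero rows;
+                  -- the worker's heartbeat loop then reports "not alive",
+                  -- sets its cancel event, and the run finalizes as cancelled.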
+ RETURNING run_id + """, + lease_seconds, + run_id, + worker_id, + ) + return row is not None + + async def update_status( + self, + run_id: UUID, + status: RunStatus, + *, + error: str | None = None, + summary: dict | None = None, + ) -> None: + terminal = status in (RunStatus.SUCCEEDED, RunStatus.FAILED, RunStatus.CANCELLED) + await self._pool.execute( + f""" + UPDATE {self._t} + SET status = $1, + error = COALESCE($2, error), + summary = COALESCE($3::jsonb, summary), + finished_at = CASE WHEN $4 THEN now() ELSE finished_at END, + worker_id = CASE WHEN $4 THEN NULL ELSE worker_id END, + lease_expires_at = CASE WHEN $4 THEN NULL ELSE lease_expires_at END, + claimed_at = CASE WHEN $4 THEN NULL ELSE claimed_at END + WHERE run_id = $5 + """, + status.value, + error, + json.dumps(summary) if summary is not None else None, + terminal, + run_id, + ) + + async def cancel(self, run_id: UUID) -> bool: + row = await self._pool.fetchrow( + f""" + UPDATE {self._t} + SET cancel_requested = TRUE, + status = CASE WHEN status = 'queued' THEN 'cancelled' ELSE status END, + finished_at = CASE WHEN status = 'queued' THEN now() ELSE finished_at END + WHERE run_id = $1 + AND status IN ('queued', 'running') + RETURNING run_id + """, + run_id, + ) + return row is not None + + +class PostgresResultRepository: + def __init__(self, pool: "asyncpg.Pool", schema: str) -> None: + self._pool = pool + self._schema = schema + + @property + def _t(self) -> str: + return f'"{self._schema}".result' + + async def upsert_many(self, run_id: UUID, results: list[Result]) -> None: + if not results: + return + rows = [ + ( + r.result_id, + r.run_id, + r.eval_set_item_id, + r.eval_set_item_name, + r.evaluator_name, + r.evaluator_type, + r.status.value, + r.score, + [s for s in r.per_invocation_scores if s is not None], + r.trace_id, + r.span_id, + json.dumps(r.details or {}), + r.error_text, + json.dumps(r.tokens_used) if r.tokens_used is not None else None, + r.latency_ms, + r.created_at, + ) + for r in results + ] + async with self._pool.acquire() as conn: + async with conn.transaction(): + await conn.executemany( + f""" + INSERT INTO {self._t} + (result_id, run_id, eval_set_item_id, eval_set_item_name, + evaluator_name, evaluator_type, status, score, + per_invocation_scores, trace_id, span_id, details, + error_text, tokens_used, latency_ms, created_at) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12::jsonb, + $13, $14::jsonb, $15, $16) + ON CONFLICT (result_id) DO UPDATE SET + status = EXCLUDED.status, + score = EXCLUDED.score, + per_invocation_scores = EXCLUDED.per_invocation_scores, + details = EXCLUDED.details, + error_text = EXCLUDED.error_text, + tokens_used = EXCLUDED.tokens_used, + latency_ms = EXCLUDED.latency_ms + """, + rows, + ) + + async def list_by_run(self, run_id: UUID) -> list[Result]: + rows = await self._pool.fetch( + f"SELECT * FROM {self._t} WHERE run_id = $1 ORDER BY created_at", + run_id, + ) + return [_row_to_result(r) for r in rows] + + async def delete_by_run(self, run_id: UUID) -> None: + await self._pool.execute(f"DELETE FROM {self._t} WHERE run_id = $1", run_id) + + +class PostgresRepos(Repos): + """Repos backed by a single asyncpg pool. 
``close()`` shuts the pool down.""" + + def __init__(self, *, pool: "asyncpg.Pool", schema: str) -> None: + super().__init__( + sessions=PostgresSessionRepository(pool, schema), + runs=PostgresRunRepository(pool, schema), + results=PostgresResultRepository(pool, schema), + backend="postgres", + ) + self._pool = pool + + @classmethod + async def create(cls, *, pool: "asyncpg.Pool", schema: str) -> "PostgresRepos": + return cls(pool=pool, schema=schema) + + async def close(self) -> None: + await self._pool.close() diff --git a/tests/api/__init__.py b/tests/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/api/test_evaluate_persistence.py b/tests/api/test_evaluate_persistence.py new file mode 100644 index 0000000..d4bd690 --- /dev/null +++ b/tests/api/test_evaluate_persistence.py @@ -0,0 +1,173 @@ +"""Option A: /api/evaluate variants persist when run_service is configured. + +These tests stub a memory-backed RunService onto app.state so we can drive +the persistence path without standing up a real Postgres. The lifespan +itself only configures run_service when AGENTEVALS_STORAGE_BACKEND=postgres, +so production behavior matches: memory backend leaves runId=null and never +writes; postgres backend persists. +""" + +from __future__ import annotations + +import asyncio +import json +from pathlib import Path + +import pytest +from fastapi.testclient import TestClient + +from agentevals.api.app import create_app +from agentevals.run.service import RunService +from agentevals.storage.repos.memory import MemoryRepos + +REPO_ROOT = Path(__file__).resolve().parents[2] +SAMPLE_TRACE = REPO_ROOT / "samples" / "helm.json" + + +def _has_sample() -> bool: + return SAMPLE_TRACE.exists() + + +@pytest.fixture +def app_no_runs(): + """No run_service injected, so /api/evaluate runs but does not persist.""" + return create_app() + + +@pytest.fixture +def app_with_runs(): + """Memory-backed run_service simulates the postgres-enabled deployment.""" + repos = MemoryRepos.create() + app = create_app() + app.state.run_service = RunService(repos.runs, repos.results) + return app, repos + + +@pytest.mark.skipif(not _has_sample(), reason="samples/helm.json missing") +class TestEvaluateMultipartSync: + def test_no_run_id_in_response_when_run_service_unset(self, app_no_runs): + with TestClient(app_no_runs) as client: + with SAMPLE_TRACE.open("rb") as f: + r = client.post( + "/api/evaluate", + files={"trace_files": ("helm.json", f, "application/json")}, + data={"config": '{"metrics": ["tool_trajectory_avg_score"]}'}, + ) + assert r.status_code == 200 + assert r.json()["data"].get("runId") is None + + def test_run_persisted_when_run_service_set(self, app_with_runs): + app, repos = app_with_runs + with TestClient(app) as client: + with SAMPLE_TRACE.open("rb") as f: + r = client.post( + "/api/evaluate", + files={"trace_files": ("helm.json", f, "application/json")}, + data={"config": '{"metrics": ["tool_trajectory_avg_score"]}'}, + ) + assert r.status_code == 200 + run_id = r.json()["data"]["runId"] + assert run_id is not None + runs = asyncio.run(repos.runs.list()) + assert len(runs) == 1 + run = runs[0] + assert str(run.run_id) == run_id + # Status is succeeded because no top-level errors fired even though + # the metric_result inside may have errored (no eval_set provided). 
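+            # We accept "failed" too: per-metric errors can flip the aggregate.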
+ assert run.status.value in ("succeeded", "failed") + # The "uploaded" target kind captures audit metadata about the upload + assert run.spec.target.kind == "uploaded" + assert run.spec.target.trace_files == ["helm.json"] + assert run.spec.target.trace_count == 1 + + def test_results_persisted_alongside_run(self, app_with_runs): + app, repos = app_with_runs + with TestClient(app) as client: + with SAMPLE_TRACE.open("rb") as f: + r = client.post( + "/api/evaluate", + files={"trace_files": ("helm.json", f, "application/json")}, + data={"config": '{"metrics": ["tool_trajectory_avg_score"]}'}, + ) + run_id = r.json()["data"]["runId"] + results = asyncio.run(repos.results.list_by_run(_uuid(run_id))) + assert len(results) >= 1 + for res in results: + assert res.evaluator_type in ("builtin", "code", "remote", "openai_eval") + assert res.run_id == _uuid(run_id) + + def test_each_call_creates_distinct_run(self, app_with_runs): + """Multiple UI uploads accumulate in run history; each gets its own + Run row. This is the core OSS user value of Option A.""" + app, repos = app_with_runs + with TestClient(app) as client: + for _ in range(3): + with SAMPLE_TRACE.open("rb") as f: + client.post( + "/api/evaluate", + files={"trace_files": ("helm.json", f, "application/json")}, + data={"config": '{"metrics": ["tool_trajectory_avg_score"]}'}, + ) + runs = asyncio.run(repos.runs.list()) + assert len(runs) == 3 + assert len({r.run_id for r in runs}) == 3 + + def test_persistence_failure_does_not_break_response(self, app_with_runs, monkeypatch): + """The eval result must reach the caller even if persistence fails; + history is best-effort, the eval contract is not.""" + app, repos = app_with_runs + + async def boom(*args, **kwargs): + raise RuntimeError("simulated persistence outage") + + monkeypatch.setattr(app.state.run_service, "record_completed_eval", boom) + with TestClient(app) as client: + with SAMPLE_TRACE.open("rb") as f: + r = client.post( + "/api/evaluate", + files={"trace_files": ("helm.json", f, "application/json")}, + data={"config": '{"metrics": ["tool_trajectory_avg_score"]}'}, + ) + assert r.status_code == 200 + assert r.json()["data"].get("runId") is None + + +@pytest.mark.skipif(not _has_sample(), reason="samples/helm.json missing") +class TestEvaluateSseStream: + def test_done_event_includes_run_id_when_persisted(self, app_with_runs): + app, _repos = app_with_runs + with TestClient(app) as client: + with SAMPLE_TRACE.open("rb") as f: + with client.stream( + "POST", + "/api/evaluate/stream", + files={"trace_files": ("helm.json", f, "application/json")}, + data={"config": '{"metrics": ["tool_trajectory_avg_score"]}'}, + ) as resp: + body = b"".join(resp.iter_bytes()).decode() + # The done event payload is JSON in the last `data:` block. 
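+                # e.g.  data: {"done": true, "result": {"runId": "..."}}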
+ done_payload = _last_done_payload(body) + assert done_payload is not None + assert done_payload.get("result", {}).get("runId") is not None + + +def _last_done_payload(sse_text: str) -> dict | None: + """Pick the SSE event whose JSON carries ``done: true`` (the SSEDoneEvent + shape from api/models.py — ``{"done": true, "result": {...}}``).""" + last = None + for line in sse_text.splitlines(): + if not line.startswith("data: "): + continue + try: + payload = json.loads(line[len("data: ") :]) + except json.JSONDecodeError: + continue + if payload.get("done") is True: + last = payload + return last + + +def _uuid(value): + from uuid import UUID + + return UUID(value) diff --git a/tests/api/test_runs_routes.py b/tests/api/test_runs_routes.py new file mode 100644 index 0000000..6153006 --- /dev/null +++ b/tests/api/test_runs_routes.py @@ -0,0 +1,185 @@ +"""HTTP-level tests for /api/runs endpoints.""" + +from __future__ import annotations + +import json +from uuid import uuid4 + +import pytest +from fastapi.testclient import TestClient + +from agentevals.api.app import create_app +from agentevals.run.service import RunService +from agentevals.storage.repos.memory import MemoryRepos + + +@pytest.fixture +def memory_app(monkeypatch): + """App with the storage env unset; backend defaults to memory and + /api/runs handlers should return 503 with a configuration hint.""" + for var in ("AGENTEVALS_STORAGE_BACKEND", "AGENTEVALS_DATABASE_URL"): + monkeypatch.delenv(var, raising=False) + return create_app() + + +@pytest.fixture +def stubbed_app(memory_app): + """App that has a memory-backed RunService injected onto app.state, so + we can exercise /api/runs handler logic without standing up a real PG.""" + repos = MemoryRepos.create() + memory_app.state.run_service = RunService(repos.runs, repos.results) + return memory_app, repos + + +class TestMemoryBackendReturns503: + def test_get_runs(self, memory_app): + with TestClient(memory_app) as client: + r = client.get("/api/runs") + assert r.status_code == 503 + assert "AGENTEVALS_STORAGE_BACKEND=postgres" in r.json()["detail"] + + def test_post_run(self, memory_app): + with TestClient(memory_app) as client: + r = client.post( + "/api/runs", + json={"spec": {"approach": "trace_replay", "target": {"kind": "inline", "inline": {}}}}, + ) + assert r.status_code == 503 + + def test_get_run_by_id(self, memory_app): + with TestClient(memory_app) as client: + r = client.get(f"/api/runs/{uuid4()}") + assert r.status_code == 503 + + def test_get_run_results(self, memory_app): + with TestClient(memory_app) as client: + r = client.get(f"/api/runs/{uuid4()}/results") + assert r.status_code == 503 + + def test_cancel_run(self, memory_app): + with TestClient(memory_app) as client: + r = client.post(f"/api/runs/{uuid4()}/cancel") + assert r.status_code == 503 + + def test_health_endpoint_unaffected(self, memory_app): + with TestClient(memory_app) as client: + r = client.get("/api/health") + assert r.status_code == 200 + + +class TestSubmitRun: + def _payload(self, *, marker="x"): + return { + "spec": { + "approach": "trace_replay", + "target": {"kind": "inline", "inline": {"m": marker}}, + } + } + + def test_submit_returns_202(self, stubbed_app): + app, _ = stubbed_app + with TestClient(app) as client: + r = client.post("/api/runs", json=self._payload()) + assert r.status_code == 202 + body = r.json() + assert body["data"]["status"] == "queued" + assert body["data"]["runId"] + + def test_submit_with_explicit_id(self, stubbed_app): + app, _ = stubbed_app + run_id = 
"11111111-1111-1111-1111-111111111111" + payload = {**self._payload(), "runId": run_id} + with TestClient(app) as client: + r = client.post("/api/runs", json=payload) + assert r.status_code == 202 + assert r.json()["data"]["runId"] == run_id + + def test_idempotent_resubmit_same_spec(self, stubbed_app): + app, _ = stubbed_app + run_id = "22222222-2222-2222-2222-222222222222" + payload = {**self._payload(marker="same"), "runId": run_id} + with TestClient(app) as client: + r1 = client.post("/api/runs", json=payload) + r2 = client.post("/api/runs", json=payload) + assert r1.status_code == 202 + assert r2.status_code == 202 + assert r1.json()["data"]["runId"] == r2.json()["data"]["runId"] + + def test_resubmit_with_different_spec_returns_409(self, stubbed_app): + app, _ = stubbed_app + run_id = "33333333-3333-3333-3333-333333333333" + with TestClient(app) as client: + r1 = client.post("/api/runs", json={**self._payload(marker="A"), "runId": run_id}) + r2 = client.post("/api/runs", json={**self._payload(marker="B"), "runId": run_id}) + assert r1.status_code == 202 + assert r2.status_code == 409 + body = r2.json() + assert "already exists" in body["detail"]["message"] + assert body["detail"]["persisted"]["runId"] == run_id + + def test_invalid_target_kind_rejected(self, stubbed_app): + app, _ = stubbed_app + with TestClient(app) as client: + r = client.post( + "/api/runs", + json={"spec": {"approach": "trace_replay", "target": {"kind": "not-a-kind"}}}, + ) + assert r.status_code == 422 + + +class TestGetAndListRuns: + def test_unknown_run_id_returns_404(self, stubbed_app): + app, _ = stubbed_app + with TestClient(app) as client: + r = client.get(f"/api/runs/{uuid4()}") + assert r.status_code == 404 + + def test_list_empty_then_after_submit(self, stubbed_app): + app, _ = stubbed_app + with TestClient(app) as client: + r1 = client.get("/api/runs") + assert r1.json()["data"] == [] + client.post( + "/api/runs", + json={"spec": {"approach": "trace_replay", "target": {"kind": "inline", "inline": {}}}}, + ) + r2 = client.get("/api/runs") + assert len(r2.json()["data"]) == 1 + + def test_list_status_filter(self, stubbed_app): + app, repos = stubbed_app + with TestClient(app) as client: + client.post( + "/api/runs", + json={"spec": {"approach": "trace_replay", "target": {"kind": "inline", "inline": {}}}}, + ) + r = client.get("/api/runs?status=queued") + assert len(r.json()["data"]) == 1 + r = client.get("/api/runs?status=succeeded") + assert r.json()["data"] == [] + + +class TestCancelRun: + def test_cancel_unknown_run_404(self, stubbed_app): + app, _ = stubbed_app + with TestClient(app) as client: + r = client.post(f"/api/runs/{uuid4()}/cancel") + assert r.status_code == 404 + + def test_cancel_queued_run_marks_cancelled(self, stubbed_app): + app, _ = stubbed_app + with TestClient(app) as client: + sub = client.post( + "/api/runs", + json={"spec": {"approach": "trace_replay", "target": {"kind": "inline", "inline": {}}}}, + ) + run_id = sub.json()["data"]["runId"] + r = client.post(f"/api/runs/{run_id}/cancel") + assert r.status_code == 200 + assert r.json()["data"]["status"] == "cancelled" + + def test_get_run_results_for_unknown_run_404(self, stubbed_app): + app, _ = stubbed_app + with TestClient(app) as client: + r = client.get(f"/api/runs/{uuid4()}/results") + assert r.status_code == 404 diff --git a/tests/run/__init__.py b/tests/run/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/run/test_fetcher.py b/tests/run/test_fetcher.py new file mode 100644 index 0000000..833975f 
--- /dev/null +++ b/tests/run/test_fetcher.py @@ -0,0 +1,79 @@ +"""Trace fetcher dispatch + InlineTraceFetcher behavior.""" + +from __future__ import annotations + +import json + +import pytest + +from agentevals.run.fetcher import HttpTraceFetcher, InlineTraceFetcher, resolve_fetcher +from agentevals.storage.models import TraceTarget + + +class TestResolveFetcher: + def test_inline_returns_inline_fetcher(self): + f = resolve_fetcher(TraceTarget(kind="inline", inline={})) + assert isinstance(f, InlineTraceFetcher) + + def test_http_returns_http_fetcher(self): + f = resolve_fetcher(TraceTarget(kind="http", base_url="https://x", trace_id="abc")) + assert isinstance(f, HttpTraceFetcher) + + def test_uploaded_rejected_with_clear_error(self): + """Uploaded targets cannot be re-executed by the worker; they only + record audit metadata for /api/evaluate calls. resolve_fetcher must + raise rather than silently returning None or a fallback fetcher.""" + with pytest.raises(ValueError, match="cannot be re-executed"): + resolve_fetcher(TraceTarget(kind="uploaded")) + + +class TestInlineTraceFetcher: + async def test_loads_jaeger_format(self, tmp_path): + sample = { + "data": [ + { + "traceID": "1234", + "spans": [ + { + "traceID": "1234", + "spanID": "abcd", + "operationName": "op", + "startTime": 1000, + "duration": 100, + "tags": [], + "logs": [], + "references": [], + "processID": "p1", + } + ], + "processes": {"p1": {"serviceName": "svc"}}, + } + ] + } + fetcher = InlineTraceFetcher() + traces = await fetcher.fetch( + TraceTarget(kind="inline", inline=sample), + context={}, + ) + assert len(traces) >= 1 + + async def test_missing_inline_raises(self): + fetcher = InlineTraceFetcher() + with pytest.raises(ValueError, match="target.inline"): + await fetcher.fetch(TraceTarget(kind="inline"), context={}) + + +class TestHttpTraceFetcher: + """HttpTraceFetcher hits the network; we test the validation path that + runs before any HTTP traffic. 
End-to-end HTTP behavior is covered by
+    the run-flow integration test."""

+    async def test_missing_base_url_raises(self):
+        fetcher = HttpTraceFetcher()
+        with pytest.raises(ValueError, match="base_url"):
+            await fetcher.fetch(TraceTarget(kind="http", trace_id="abc"), context={})
+
+    async def test_missing_trace_id_raises(self):
+        fetcher = HttpTraceFetcher()
+        with pytest.raises(ValueError, match="trace_id"):
+            await fetcher.fetch(TraceTarget(kind="http", base_url="https://x"), context={})
diff --git a/tests/run/test_result_builder.py b/tests/run/test_result_builder.py
new file mode 100644
index 0000000..19b46c2
--- /dev/null
+++ b/tests/run/test_result_builder.py
@@ -0,0 +1,94 @@
+"""Pure-function tests for build_results / summarize_run_result / classify_evaluator."""
+
+from __future__ import annotations
+
+from uuid import UUID, uuid4
+
+from agentevals.config import BuiltinMetricDef, CodeEvaluatorDef, EvalParams
+from agentevals.run.result_builder import build_results, classify_evaluator, summarize_run_result
+from agentevals.runner import MetricResult, RunResult, TraceResult
+from agentevals.storage.models import ResultStatus
+
+
+def _params(custom_evaluators=None) -> EvalParams:
+    return EvalParams(metrics=["m_builtin"], custom_evaluators=custom_evaluators or [])
+
+
+def _trace_result(*metrics) -> TraceResult:
+    return TraceResult(trace_id="trace-1", num_invocations=1, metric_results=list(metrics))
+
+
+def _mr(name="m_builtin", **kw):
+    kw.setdefault("eval_status", "PASSED")
+    return MetricResult(metric_name=name, **kw)
+
+
+class TestClassifyEvaluator:
+    def test_unknown_falls_back_to_builtin(self):
+        assert classify_evaluator("unknown", _params()) == "builtin"
+
+    def test_custom_code_classified_correctly(self):
+        params = _params(custom_evaluators=[CodeEvaluatorDef(name="my_code", path="./e.py")])
+        assert classify_evaluator("my_code", params) == "code"
+
+    def test_builtin_in_metrics_list(self):
+        """Even when explicitly listed in params.metrics, the absence of a
+        matching custom_evaluators entry defaults to 'builtin'. This is
+        intentional: the persisted result row needs a stable type label and
+        custom evaluators are the only ones we can disambiguate by name."""
+        assert classify_evaluator("m_builtin", _params()) == "builtin"
+
+
+class TestBuildResults:
+    def test_one_metric_per_trace_yields_one_result(self):
+        run_id = uuid4()
+        rr = RunResult(trace_results=[_trace_result(_mr())])
+        results = build_results(run_id, _params(), rr)
+        assert len(results) == 1
+        assert results[0].run_id == run_id
+        assert results[0].evaluator_name == "m_builtin"
+
+    def test_multiple_metrics_flatten(self):
+        rr = RunResult(
+            trace_results=[
+                _trace_result(_mr(name="a"), _mr(name="b"), _mr(name="c")),
+                _trace_result(_mr(name="a")),
+            ]
+        )
+        results = build_results(uuid4(), _params(), rr)
+        assert len(results) == 4
+        names = sorted(r.evaluator_name for r in results)
+        assert names == ["a", "a", "b", "c"]
+
+    def test_eval_set_item_id_defaults_to_trace_id(self):
+        """OSS scope: no per-eval-case id extraction. Trace id is the stable
+        identifier for both eval_set_item_id and eval_set_item_name. 
Test + locks this so future changes are deliberate.""" + rr = RunResult(trace_results=[_trace_result(_mr())]) + result = build_results(uuid4(), _params(), rr)[0] + assert result.eval_set_item_id == "trace-1" + assert result.eval_set_item_name == "trace-1" + assert result.trace_id == "trace-1" + + +class TestSummarizeRunResult: + def test_counts_pass_fail_skip_error(self): + rr = RunResult( + trace_results=[ + _trace_result( + _mr(eval_status="PASSED"), + _mr(eval_status="FAILED"), + _mr(eval_status="NOT_EVALUATED"), + _mr(error="boom"), + ) + ] + ) + summary = summarize_run_result(rr) + assert summary["result_counts"] == {"passed": 1, "failed": 1, "skipped": 1, "errored": 1} + assert summary["trace_count"] == 1 + + def test_propagates_errors_and_perf(self): + rr = RunResult(errors=["loader failure"], performance_metrics={"p50": 100}) + summary = summarize_run_result(rr) + assert summary["errors"] == ["loader failure"] + assert summary["performance_metrics"] == {"p50": 100} diff --git a/tests/run/test_service.py b/tests/run/test_service.py new file mode 100644 index 0000000..e5effa8 --- /dev/null +++ b/tests/run/test_service.py @@ -0,0 +1,155 @@ +"""RunService unit tests against memory repos.""" + +from __future__ import annotations + +from uuid import uuid4 + +import pytest + +from agentevals.config import EvalParams +from agentevals.run.service import RunService, RunSubmitConflict +from agentevals.runner import MetricResult, RunResult, TraceResult +from agentevals.storage.models import RunSpec, RunStatus, TraceTarget +from agentevals.storage.repos.memory import MemoryRepos + + +def _spec(*, marker: str = "default") -> RunSpec: + return RunSpec( + approach="trace_replay", + target=TraceTarget(kind="inline", inline={"marker": marker}), + ) + + +@pytest.fixture +def service(): + repos = MemoryRepos.create() + return RunService(repos.runs, repos.results), repos + + +class TestRunServiceSubmit: + async def test_first_submit_creates_run(self, service): + svc, _ = service + run = await svc.submit(run_id=None, spec=_spec()) + assert run.run_id is not None + assert run.status == RunStatus.QUEUED + + async def test_resubmit_with_same_id_and_spec_idempotent(self, service): + svc, _ = service + run = await svc.submit(run_id=None, spec=_spec()) + again = await svc.submit(run_id=run.run_id, spec=_spec()) + assert again.run_id == run.run_id + + async def test_resubmit_with_different_spec_raises_conflict(self, service): + """409 path: re-submitting an existing run_id with a different spec + must NOT overwrite the persisted row, and must surface the persisted + spec to the caller for reconciliation.""" + svc, _ = service + run = await svc.submit(run_id=None, spec=_spec(marker="A")) + with pytest.raises(RunSubmitConflict) as excinfo: + await svc.submit(run_id=run.run_id, spec=_spec(marker="B")) + # The persisted spec attached to the exception should be the original + assert excinfo.value.persisted.spec.target.inline == {"marker": "A"} + + async def test_explicit_run_id_honored(self, service): + svc, _ = service + run_id = uuid4() + run = await svc.submit(run_id=run_id, spec=_spec()) + assert run.run_id == run_id + + +class TestRunServiceQueries: + async def test_get_returns_none_for_unknown(self, service): + svc, _ = service + assert await svc.get(uuid4()) is None + + async def test_list_returns_empty_initially(self, service): + svc, _ = service + assert await svc.list() == [] + + async def test_list_after_submit(self, service): + svc, _ = service + await svc.submit(run_id=None, spec=_spec()) + await 
svc.submit(run_id=None, spec=_spec()) + runs = await svc.list() + assert len(runs) == 2 + + async def test_cancel_unknown_run_returns_false(self, service): + svc, _ = service + assert await svc.cancel(uuid4()) is False + + +class TestRecordCompletedEval: + """Option A: /api/evaluate synchronously persists runs + results.""" + + def _params(self) -> EvalParams: + return EvalParams(metrics=["m1"]) + + def _run_result(self, *, errors=None, metrics=None) -> RunResult: + return RunResult( + trace_results=[ + TraceResult( + trace_id="trace-1", + num_invocations=1, + metric_results=metrics or [MetricResult(metric_name="m1", eval_status="PASSED", score=0.9)], + ) + ], + errors=errors or [], + ) + + async def test_persists_run_as_succeeded_when_no_errors(self, service): + svc, repos = service + run = await svc.record_completed_eval( + spec=_spec(), + params=self._params(), + run_result=self._run_result(), + ) + assert run.status == RunStatus.SUCCEEDED + listed = await repos.runs.list() + assert len(listed) == 1 + assert listed[0].status == RunStatus.SUCCEEDED + + async def test_persists_run_as_failed_when_errors_present(self, service): + svc, repos = service + run = await svc.record_completed_eval( + spec=_spec(), + params=self._params(), + run_result=self._run_result(errors=["loader failed"]), + ) + assert run.status == RunStatus.FAILED + assert run.error and "loader failed" in run.error + listed = await repos.runs.list() + assert listed[0].status == RunStatus.FAILED + + async def test_persists_result_rows(self, service): + svc, repos = service + run = await svc.record_completed_eval( + spec=_spec(), + params=self._params(), + run_result=self._run_result(), + ) + results = await repos.results.list_by_run(run.run_id) + assert len(results) == 1 + assert results[0].evaluator_name == "m1" + + async def test_summary_attached_to_run(self, service): + svc, _ = service + run = await svc.record_completed_eval( + spec=_spec(), + params=self._params(), + run_result=self._run_result( + metrics=[ + MetricResult(metric_name="m1", eval_status="PASSED"), + MetricResult(metric_name="m2", eval_status="FAILED"), + ] + ), + ) + assert run.summary is not None + assert run.summary["result_counts"]["passed"] == 1 + assert run.summary["result_counts"]["failed"] == 1 + + async def test_each_call_creates_distinct_run(self, service): + svc, repos = service + a = await svc.record_completed_eval(spec=_spec(), params=self._params(), run_result=self._run_result()) + b = await svc.record_completed_eval(spec=_spec(), params=self._params(), run_result=self._run_result()) + assert a.run_id != b.run_id + assert len(await repos.runs.list()) == 2 diff --git a/tests/run/test_sinks.py b/tests/run/test_sinks.py new file mode 100644 index 0000000..39ffd38 --- /dev/null +++ b/tests/run/test_sinks.py @@ -0,0 +1,248 @@ +"""Result sink tests. + +Covers stdout / file sinks fully in-process and HttpWebhookSink against a +mock httpx transport so we exercise retry behavior without touching the network. 
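+
+The payload shape asserted below (inferred from these tests, not a formal
+spec): every emit serializes a single JSON object such as
+``{"phase": "partial" | "final" | "error", "run_id": "<uuid>", ...}``.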
+""" + +from __future__ import annotations + +import contextlib +import json +from pathlib import Path +from unittest.mock import patch +from uuid import UUID, uuid4 + +import httpx +import pytest + +from agentevals.run.sinks import ( + FileSink, + HttpWebhookSink, + SinkFanout, + StdoutSink, + build_sinks, +) +from agentevals.storage.models import Result, ResultStatus + + +@contextlib.contextmanager +def _mock_async_client(transport: httpx.MockTransport): + """Patch agentevals.run.sinks.httpx.AsyncClient so the sink's + ``async with httpx.AsyncClient(...)`` call routes through the mock + transport. Patching the symbol on the sinks module beats patching + httpx globally, which can leak into other tests.""" + import agentevals.run.sinks as sinks_module + + real = httpx.AsyncClient + + def _factory(*args, **kwargs): + kwargs["transport"] = transport + return real(*args, **kwargs) + + with patch.object(sinks_module.httpx, "AsyncClient", _factory): + yield + + +def _result(run_id: UUID) -> Result: + return Result( + result_id="rid-1", + run_id=run_id, + eval_set_item_id="item-1", + eval_set_item_name="trace-1", + evaluator_name="m1", + evaluator_type="builtin", + status=ResultStatus.PASSED, + score=0.9, + ) + + +class TestFileSink: + async def test_emits_partial_and_final(self, tmp_path): + path = tmp_path / "out.jsonl" + sink = FileSink(path) + run_id = uuid4() + await sink.emit_partial(run_id, [_result(run_id)], attempt=1) + await sink.emit_final(run_id, {"trace_count": 1}, attempt=1) + await sink.emit_error(run_id, "boom", attempt=1) + lines = path.read_text().strip().splitlines() + assert len(lines) == 3 + partial = json.loads(lines[0]) + assert partial["phase"] == "partial" + final = json.loads(lines[1]) + assert final["phase"] == "final" + assert final["summary"] == {"trace_count": 1} + error = json.loads(lines[2]) + assert error["phase"] == "error" + + async def test_creates_parent_directory(self, tmp_path): + path = tmp_path / "deep" / "nested" / "out.jsonl" + sink = FileSink(path) + await sink.emit_final(uuid4(), {}, attempt=1) + assert path.exists() + + +class TestStdoutSink: + async def test_writes_to_stdout(self, capsys): + sink = StdoutSink() + run_id = uuid4() + await sink.emit_partial(run_id, [_result(run_id)], attempt=1) + await sink.emit_final(run_id, {"k": "v"}, attempt=1) + captured = capsys.readouterr().out + lines = captured.strip().splitlines() + assert len(lines) == 2 + assert json.loads(lines[0])["phase"] == "partial" + assert json.loads(lines[1])["phase"] == "final" + + +class TestHttpWebhookSink: + async def test_post_succeeds_on_2xx(self): + captured: list[httpx.Request] = [] + + def handler(request: httpx.Request) -> httpx.Response: + captured.append(request) + return httpx.Response(200, json={}) + + transport = httpx.MockTransport(handler) + sink = HttpWebhookSink("https://h/x") + run_id = uuid4() + with _mock_async_client(transport): + await sink.emit_final(run_id, {"k": "v"}, attempt=1) + assert len(captured) == 1 + body = json.loads(captured[0].content) + assert body["phase"] == "final" + assert body["run_id"] == str(run_id) + + async def test_4xx_does_not_retry(self): + """4xx means the receiver rejected the payload (auth, validation, + etc); retrying would just hammer them. 
Errors are logged but the + run still completes.""" + calls = 0 + + def handler(request: httpx.Request) -> httpx.Response: + nonlocal calls + calls += 1 + return httpx.Response(401, json={"error": "unauthorized"}) + + transport = httpx.MockTransport(handler) + sink = HttpWebhookSink("https://h/x", max_attempts=5) + with _mock_async_client(transport): + await sink.emit_final(uuid4(), {}, attempt=1) + assert calls == 1 + + async def test_5xx_retries_then_gives_up(self): + calls = 0 + + def handler(request: httpx.Request) -> httpx.Response: + nonlocal calls + calls += 1 + return httpx.Response(503, text="busy") + + transport = httpx.MockTransport(handler) + sink = HttpWebhookSink("https://h/x", max_attempts=3) + with _mock_async_client(transport): + await sink.emit_final(uuid4(), {}, attempt=1) + assert calls == 3 + + async def test_headers_from_env_resolved_at_emit_time(self, monkeypatch): + """Reading env vars at emit time means a host can rotate the auth + token between runs without restarting agentevals.""" + captured: list[dict] = [] + + def handler(request: httpx.Request) -> httpx.Response: + captured.append(dict(request.headers)) + return httpx.Response(200) + + transport = httpx.MockTransport(handler) + sink = HttpWebhookSink( + "https://h/x", + headers={"X-Static": "literal"}, + headers_from_env={"Authorization": "AGENTEVALS_TEST_BEARER"}, + ) + monkeypatch.setenv("AGENTEVALS_TEST_BEARER", "Bearer token-v1") + with _mock_async_client(transport): + await sink.emit_final(uuid4(), {}, attempt=1) + assert captured[0].get("authorization") == "Bearer token-v1" + assert captured[0].get("x-static") == "literal" + + async def test_headers_from_env_skipped_when_unset(self, monkeypatch): + captured: list[dict] = [] + + def handler(request: httpx.Request) -> httpx.Response: + captured.append(dict(request.headers)) + return httpx.Response(200) + + transport = httpx.MockTransport(handler) + sink = HttpWebhookSink( + "https://h/x", + headers_from_env={"Authorization": "AGENTEVALS_TEST_UNSET_VAR"}, + ) + monkeypatch.delenv("AGENTEVALS_TEST_UNSET_VAR", raising=False) + with _mock_async_client(transport): + await sink.emit_final(uuid4(), {}, attempt=1) + assert "authorization" not in captured[0] + + +class TestBuildSinks: + def test_stdout(self): + fanout = build_sinks([{"kind": "stdout"}]) + assert isinstance(fanout, SinkFanout) + + def test_file(self, tmp_path): + fanout = build_sinks([{"kind": "file", "path": str(tmp_path / "x.jsonl")}]) + assert isinstance(fanout, SinkFanout) + + def test_http_webhook_with_auth_env_extraction(self): + fanout = build_sinks( + [ + { + "kind": "http_webhook", + "url": "https://h/x", + "auth": { + "kind": "headers", + "headers": {"Authorization": {"from_env": "MY_TOKEN"}}, + }, + } + ] + ) + assert isinstance(fanout, SinkFanout) + + def test_unknown_kind_skipped_not_raised(self): + """Forward-compat: a host running a newer agentevals replica might + emit a sink kind older replicas don't know. 
Skipping with a warning + beats crashing the entire run.""" + fanout = build_sinks([{"kind": "future_kind"}, {"kind": "stdout"}]) + assert isinstance(fanout, SinkFanout) + + +class TestSinkFanoutErrorIsolation: + """A sink that raises must not abort other sinks or the run itself.""" + + async def test_failures_logged_not_raised(self, capsys): + class BoomSink: + async def emit_partial(self, run_id, results, attempt): + raise RuntimeError("boom") + + async def emit_final(self, run_id, summary, attempt): + raise RuntimeError("boom-final") + + async def emit_error(self, run_id, error, attempt): + raise RuntimeError("boom-error") + + good_writes = [] + + class GoodSink: + async def emit_partial(self, run_id, results, attempt): + good_writes.append("partial") + + async def emit_final(self, run_id, summary, attempt): + good_writes.append("final") + + async def emit_error(self, run_id, error, attempt): + good_writes.append("error") + + fanout = SinkFanout([BoomSink(), GoodSink()]) + run_id = uuid4() + await fanout.emit_partial(run_id, [], attempt=1) + await fanout.emit_final(run_id, {}, attempt=1) + await fanout.emit_error(run_id, "x", attempt=1) + assert good_writes == ["partial", "final", "error"] diff --git a/tests/storage/__init__.py b/tests/storage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/storage/test_config.py b/tests/storage/test_config.py new file mode 100644 index 0000000..242d972 --- /dev/null +++ b/tests/storage/test_config.py @@ -0,0 +1,77 @@ +"""StorageSettings env loading and validation.""" + +from __future__ import annotations + +import pytest + +from agentevals.storage.config import StorageSettings + + +class TestStorageSettings: + def test_defaults(self): + s = StorageSettings() + assert s.backend == "memory" + assert s.database_url is None + assert s.schema_name == "agentevals" + assert s.max_concurrent_runs == 4 + + def test_lease_must_exceed_heartbeat(self): + """Catches operator misconfiguration at boot rather than at first + heartbeat: a lease shorter than the heartbeat interval lets workers + steal each other's runs.""" + with pytest.raises(ValueError, match="lease"): + StorageSettings(lease_s=5, heartbeat_s=5) + with pytest.raises(ValueError, match="lease"): + StorageSettings(lease_s=3, heartbeat_s=5) + + def test_postgres_requires_dsn(self): + with pytest.raises(ValueError, match="AGENTEVALS_DATABASE_URL"): + StorageSettings(backend="postgres", database_url=None) + + def test_postgres_with_dsn_ok(self): + s = StorageSettings(backend="postgres", database_url="postgresql://h/db") + assert s.backend == "postgres" + + def test_unknown_backend_rejected(self): + """Pydantic wraps the field_validator's ValueError in a + ValidationError; use the broader match on the inner message.""" + with pytest.raises(Exception, match="unknown storage backend|sqlite"): + StorageSettings(backend="sqlite") + + def test_from_env_reads_defaults(self, monkeypatch): + for var in [ + "AGENTEVALS_STORAGE_BACKEND", + "AGENTEVALS_DATABASE_URL", + "AGENTEVALS_DATABASE_URL_FILE", + "AGENTEVALS_DATABASE_SCHEMA", + "AGENTEVALS_MAX_CONCURRENT_RUNS", + ]: + monkeypatch.delenv(var, raising=False) + s = StorageSettings.from_env() + assert s.backend == "memory" + + def test_from_env_reads_postgres(self, monkeypatch): + monkeypatch.setenv("AGENTEVALS_STORAGE_BACKEND", "postgres") + monkeypatch.setenv("AGENTEVALS_DATABASE_URL", "postgresql://h/db") + monkeypatch.setenv("AGENTEVALS_DATABASE_SCHEMA", "custom_schema") + monkeypatch.setenv("AGENTEVALS_MAX_CONCURRENT_RUNS", "12") + s = 
StorageSettings.from_env() + assert s.backend == "postgres" + assert s.database_url == "postgresql://h/db" + assert s.schema_name == "custom_schema" + assert s.max_concurrent_runs == 12 + + def test_from_env_url_file_takes_precedence(self, tmp_path, monkeypatch): + dsn_file = tmp_path / "dsn" + dsn_file.write_text("postgresql://from-file/db\n") + monkeypatch.setenv("AGENTEVALS_STORAGE_BACKEND", "postgres") + monkeypatch.setenv("AGENTEVALS_DATABASE_URL", "postgresql://from-env/db") + monkeypatch.setenv("AGENTEVALS_DATABASE_URL_FILE", str(dsn_file)) + s = StorageSettings.from_env() + assert s.database_url == "postgresql://from-file/db" + + def test_from_env_url_file_unreadable_raises(self, tmp_path, monkeypatch): + monkeypatch.setenv("AGENTEVALS_STORAGE_BACKEND", "postgres") + monkeypatch.setenv("AGENTEVALS_DATABASE_URL_FILE", str(tmp_path / "missing")) + with pytest.raises(ValueError, match="unreadable"): + StorageSettings.from_env() diff --git a/tests/storage/test_memory_repos.py b/tests/storage/test_memory_repos.py new file mode 100644 index 0000000..cbb1564 --- /dev/null +++ b/tests/storage/test_memory_repos.py @@ -0,0 +1,226 @@ +"""MemoryRepos behavior tests. + +These exercise the same protocol surface that PostgresRepos implements, so +the test bodies double as a contract that future tests against a live PG can +re-use (parametrize the fixture). +""" + +from __future__ import annotations + +from datetime import datetime, timedelta, timezone +from uuid import UUID, uuid4 + +import pytest + +from agentevals.storage.models import Result, ResultStatus, Run, RunSpec, RunStatus, TraceTarget +from agentevals.storage.repos.memory import MemoryRepos + + +def _make_spec() -> RunSpec: + return RunSpec(approach="trace_replay", target=TraceTarget(kind="inline", inline={"data": []})) + + +def _make_run(run_id: UUID | None = None) -> Run: + return Run(run_id=run_id or uuid4(), status=RunStatus.QUEUED, spec=_make_spec()) + + +@pytest.fixture +def repos(): + return MemoryRepos.create() + + +class TestRunRepository: + async def test_create_and_get(self, repos): + run = _make_run() + await repos.runs.create(run) + fetched = await repos.runs.get(run.run_id) + assert fetched is not None + assert fetched.run_id == run.run_id + assert fetched.status == RunStatus.QUEUED + + async def test_create_idempotent_returns_existing(self, repos): + """Resubmitting the same run_id returns the persisted row, not a new + one; this is what makes POST /api/runs idempotent.""" + run = _make_run() + a = await repos.runs.create(run) + b = await repos.runs.create(run) + assert a.run_id == b.run_id + listed = await repos.runs.list() + assert len(listed) == 1 + + async def test_list_filters_by_status(self, repos): + a = _make_run() + b = _make_run() + await repos.runs.create(a) + await repos.runs.create(b) + await repos.runs.update_status(a.run_id, RunStatus.SUCCEEDED) + succeeded = await repos.runs.list(status=[RunStatus.SUCCEEDED]) + queued = await repos.runs.list(status=[RunStatus.QUEUED]) + assert {r.run_id for r in succeeded} == {a.run_id} + assert {r.run_id for r in queued} == {b.run_id} + + async def test_list_respects_limit(self, repos): + for _ in range(5): + await repos.runs.create(_make_run()) + page = await repos.runs.list(limit=3) + assert len(page) == 3 + + async def test_claim_next_picks_oldest_queued(self, repos): + first = _make_run() + second = _make_run() + await repos.runs.create(first) + await repos.runs.create(second) + claimed = await repos.runs.claim_next(worker_id="w1", lease=timedelta(seconds=30), 
max_attempts=3) + assert claimed is not None + assert claimed.run_id == first.run_id + assert claimed.status == RunStatus.RUNNING + assert claimed.attempt == 1 + + async def test_claim_next_returns_none_when_empty(self, repos): + result = await repos.runs.claim_next(worker_id="w1", lease=timedelta(seconds=30), max_attempts=3) + assert result is None + + async def test_claim_respects_max_attempts(self, repos): + """A run that has exceeded max_attempts is invisible to claim_next so + a poison run cannot starve fresh queued work via repeated re-claims.""" + run = _make_run() + await repos.runs.create(run) + for _ in range(3): + claimed = await repos.runs.claim_next(worker_id="w1", lease=timedelta(seconds=30), max_attempts=3) + if claimed is None: + break + await repos.runs.update_status(claimed.run_id, RunStatus.QUEUED) + # Reset to QUEUED but with attempt=3 already + run_now = await repos.runs.get(run.run_id) + assert run_now is not None + assert run_now.attempt >= 3 + none_claimed = await repos.runs.claim_next(worker_id="w1", lease=timedelta(seconds=30), max_attempts=3) + assert none_claimed is None + + async def test_heartbeat_returns_false_for_unknown_run(self, repos): + alive = await repos.runs.heartbeat(uuid4(), "w1", timedelta(seconds=30)) + assert alive is False + + async def test_heartbeat_returns_false_when_cancel_requested(self, repos): + run = _make_run() + await repos.runs.create(run) + claimed = await repos.runs.claim_next(worker_id="w1", lease=timedelta(seconds=30), max_attempts=3) + assert claimed is not None + await repos.runs.cancel(claimed.run_id) + alive = await repos.runs.heartbeat(claimed.run_id, "w1", timedelta(seconds=30)) + assert alive is False + + async def test_cancel_queued_run_marks_cancelled(self, repos): + run = _make_run() + await repos.runs.create(run) + ok = await repos.runs.cancel(run.run_id) + assert ok is True + fresh = await repos.runs.get(run.run_id) + assert fresh is not None + assert fresh.status == RunStatus.CANCELLED + + async def test_cancel_running_run_sets_flag_only(self, repos): + run = _make_run() + await repos.runs.create(run) + claimed = await repos.runs.claim_next(worker_id="w1", lease=timedelta(seconds=30), max_attempts=3) + assert claimed is not None + ok = await repos.runs.cancel(claimed.run_id) + assert ok is True + fresh = await repos.runs.get(claimed.run_id) + assert fresh is not None + assert fresh.status == RunStatus.RUNNING + assert fresh.cancel_requested is True + + async def test_cancel_terminal_run_returns_false(self, repos): + run = _make_run() + await repos.runs.create(run) + await repos.runs.update_status(run.run_id, RunStatus.SUCCEEDED) + ok = await repos.runs.cancel(run.run_id) + assert ok is False + + async def test_update_status_sets_finished_at_for_terminal(self, repos): + run = _make_run() + await repos.runs.create(run) + await repos.runs.update_status(run.run_id, RunStatus.SUCCEEDED, summary={"k": "v"}) + fresh = await repos.runs.get(run.run_id) + assert fresh is not None + assert fresh.finished_at is not None + assert fresh.summary == {"k": "v"} + + +class TestResultRepository: + def _make_result(self, run_id: UUID, suffix: str = "") -> Result: + return Result( + result_id=f"hash-{run_id}-{suffix}", + run_id=run_id, + eval_set_item_id=f"item-{suffix}", + eval_set_item_name=f"trace-{suffix}", + evaluator_name="m1", + evaluator_type="builtin", + status=ResultStatus.PASSED, + score=0.9, + ) + + async def test_upsert_many_persists_results(self, repos): + run_id = uuid4() + results = [self._make_result(run_id, "a"), 
self._make_result(run_id, "b")] + await repos.results.upsert_many(run_id, results) + listed = await repos.results.list_by_run(run_id) + assert len(listed) == 2 + assert {r.result_id for r in listed} == {results[0].result_id, results[1].result_id} + + async def test_upsert_many_idempotent_on_result_id(self, repos): + """Re-upserting the same result_id replaces the row so retried + webhook posts and worker re-execution stay deduplicated.""" + run_id = uuid4() + first = self._make_result(run_id, "a") + await repos.results.upsert_many(run_id, [first]) + first.score = 0.5 + await repos.results.upsert_many(run_id, [first]) + listed = await repos.results.list_by_run(run_id) + assert len(listed) == 1 + assert listed[0].score == 0.5 + + async def test_empty_upsert_is_noop(self, repos): + run_id = uuid4() + await repos.results.upsert_many(run_id, []) + listed = await repos.results.list_by_run(run_id) + assert listed == [] + + async def test_delete_by_run(self, repos): + run_id = uuid4() + await repos.results.upsert_many(run_id, [self._make_result(run_id, "a")]) + await repos.results.delete_by_run(run_id) + assert await repos.results.list_by_run(run_id) == [] + + +class TestSessionRepository: + """SessionRepository is forward-compat scaffolding in this slice; cover + the basic CRUD surface so regressions surface if the protocol drifts.""" + + async def test_upsert_and_get(self, repos): + from agentevals.streaming.session import TraceSession + + s = TraceSession(session_id="sess-1", trace_id="t-1", eval_set_id=None) + s.trace_ids.add("t-1") + await repos.sessions.upsert(s) + fetched = await repos.sessions.get("sess-1") + assert fetched is not None + assert fetched.session_id == "sess-1" + + async def test_find_by_trace_id(self, repos): + from agentevals.streaming.session import TraceSession + + s = TraceSession(session_id="sess-1", trace_id="t-1", eval_set_id=None) + s.trace_ids.update({"t-1", "t-2"}) + await repos.sessions.upsert(s) + match = await repos.sessions.find_by_trace_id("t-2") + assert match is not None + assert match.session_id == "sess-1" + + async def test_delete(self, repos): + from agentevals.streaming.session import TraceSession + + await repos.sessions.upsert(TraceSession(session_id="sess-1", trace_id="t-1", eval_set_id=None)) + await repos.sessions.delete("sess-1") + assert await repos.sessions.get("sess-1") is None diff --git a/tests/storage/test_migrator.py b/tests/storage/test_migrator.py new file mode 100644 index 0000000..e6f1d90 --- /dev/null +++ b/tests/storage/test_migrator.py @@ -0,0 +1,133 @@ +"""Migration runner tests. + +The pure helpers (file discovery + schema substitution) are tested directly. +Live PG behavior is tested only when AGENTEVALS_TEST_DATABASE_URL is set; +otherwise those tests skip so the suite stays runnable in pure-Python sandboxes. 
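+
+Example invocation (the DSN shape is an assumption; point it at a disposable
+database)::
+
+    AGENTEVALS_TEST_DATABASE_URL=postgresql://user:pass@localhost:5432/scratch \
+        pytest tests/storage/test_migrator.py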
+""" + +from __future__ import annotations + +import os +import re + +import pytest + +from agentevals.storage.postgres.migrator import ( + ADVISORY_LOCK_KEY, + Migration, + Migrator, + _apply_schema, + _discover_migrations, + discover_migrations, +) + + +class TestDiscoverMigrations: + def test_finds_baseline(self): + migrations = _discover_migrations() + assert len(migrations) >= 1 + first = migrations[0] + assert first.version == 1 + assert first.name == "init" + assert first.up_sql.strip() + assert first.down_sql is not None and first.down_sql.strip() + + def test_versions_sorted(self): + migrations = _discover_migrations() + versions = [m.version for m in migrations] + assert versions == sorted(versions) + + def test_public_alias_matches(self): + assert [m.version for m in discover_migrations()] == [m.version for m in _discover_migrations()] + + +class TestApplySchema: + def test_substitutes_placeholder(self): + sql = "CREATE TABLE {schema}.foo (id INT)" + assert _apply_schema(sql, "agentevals") == "CREATE TABLE agentevals.foo (id INT)" + + def test_collapses_doubled_braces(self): + """Doubled braces in SQL literals (e.g. JSONB defaults like '{{}}') + collapse to single braces after the {schema} substitution; this + keeps SQL files readable while letting the placeholder expand.""" + sql = "metadata JSONB NOT NULL DEFAULT '{{}}'" + assert _apply_schema(sql, "agentevals") == "metadata JSONB NOT NULL DEFAULT '{}'" + + def test_supports_custom_schema(self): + sql = "CREATE TABLE {schema}.foo (id INT)" + assert _apply_schema(sql, "myteam") == "CREATE TABLE myteam.foo (id INT)" + + def test_rejects_non_identifier_schema(self): + """Defense against SQL injection via schema name. Schema is taken + from an env var which an operator controls but a future bug could + plumb in untrusted input; the regex stops anything but a SQL identifier.""" + with pytest.raises(ValueError, match="invalid schema"): + _apply_schema("CREATE TABLE {schema}.foo", "drop; DROP TABLE users") + + def test_rejects_quoted_schema(self): + with pytest.raises(ValueError, match="invalid schema"): + _apply_schema("X", '"agentevals"') + + +class TestAdvisoryLockKey: + def test_fits_int8(self): + """pg_try_advisory_lock requires an int8; a key wider than that + wraps silently and would collide unpredictably. Lock key chosen at + random; this test only guards against future drift.""" + assert -(2**63) <= ADVISORY_LOCK_KEY < 2**63 + + def test_stable(self): + """Changing the lock key would let two concurrent migrators race. + Only update the key alongside an explicit migration to a new key.""" + assert ADVISORY_LOCK_KEY == 7259820376655812345 + + +class TestMigrationFilePattern: + def test_filename_format(self): + migrations = _discover_migrations() + for m in migrations: + assert isinstance(m, Migration) + assert re.match(r"^[a-z0-9_]+$", m.name) + assert m.version > 0 + + +@pytest.mark.skipif( + not os.environ.get("AGENTEVALS_TEST_DATABASE_URL"), + reason="requires AGENTEVALS_TEST_DATABASE_URL pointing at a disposable Postgres", +) +class TestMigratorLive: + """Apply / no-op replay / version / force / down — all against a real PG. + + Each test creates and drops its own schema so they can run in any order + against the same database without interfering. 
+ """ + + @pytest.fixture + async def migrator(self): + dsn = os.environ["AGENTEVALS_TEST_DATABASE_URL"] + schema = "agentevals_test_migrator" + m = Migrator(dsn=dsn, schema=schema, lock_timeout_s=10) + yield m + # cleanup + try: + await m.down(steps=1) + except Exception: + pass + + async def test_up_then_replay_is_noop(self, migrator): + applied = await migrator.up() + assert applied == [1] + again = await migrator.up() + assert again == [] + + async def test_version_after_up(self, migrator): + await migrator.up() + status = await migrator.status() + assert status.version == 1 + assert status.dirty is False + + async def test_force_clears_dirty(self, migrator): + await migrator.up() + await migrator.force(version=1) + status = await migrator.status() + assert status.dirty is False diff --git a/tests/storage/test_models.py b/tests/storage/test_models.py new file mode 100644 index 0000000..59629d0 --- /dev/null +++ b/tests/storage/test_models.py @@ -0,0 +1,183 @@ +"""Storage model unit tests: pure functions, validation, MetricResult mapping.""" + +from __future__ import annotations + +import hashlib +from uuid import UUID + +import pytest + +from agentevals.runner import MetricResult +from agentevals.storage.models import ( + Result, + ResultStatus, + Run, + RunSpec, + RunStatus, + TraceTarget, + compute_result_id, +) + + +class TestComputeResultId: + def test_deterministic(self): + a = compute_result_id("00000000-0000-0000-0000-000000000001", "item-x", "metric-y") + b = compute_result_id("00000000-0000-0000-0000-000000000001", "item-x", "metric-y") + assert a == b + + def test_uuid_lowercased(self): + upper = compute_result_id("00000000-0000-0000-0000-00000000ABCD", "item", "m") + lower = compute_result_id("00000000-0000-0000-0000-00000000abcd", "item", "m") + assert upper == lower + + def test_uuid_object_and_string_match(self): + u = UUID("00000000-0000-0000-0000-000000000001") + assert compute_result_id(u, "item", "m") == compute_result_id(str(u), "item", "m") + + def test_pipe_delimiter_byte_spec(self): + """Locks the canonical formula so producer (Python) and any future + consumer agree byte-for-byte. 
Any change here is a breaking change.""" + expected = hashlib.sha256(b"abc|item|m").hexdigest() + assert compute_result_id("abc", "item", "m") == expected + + +class TestTraceTargetValidation: + def test_inline(self): + t = TraceTarget(kind="inline", inline={"data": []}) + assert t.kind == "inline" + + def test_http_with_base_url(self): + t = TraceTarget(kind="http", base_url="https://example/", trace_id="abc") + assert t.base_url == "https://example/" + assert t.trace_id == "abc" + + def test_uploaded_with_audit_metadata(self): + t = TraceTarget(kind="uploaded", trace_count=2, trace_files=["a.json", "b.json"]) + assert t.kind == "uploaded" + assert t.trace_count == 2 + assert t.trace_files == ["a.json", "b.json"] + + def test_unknown_kind_rejected(self): + from pydantic import ValidationError + + with pytest.raises(ValidationError): + TraceTarget(kind="not-a-kind") + + +class TestRunSpec: + def test_minimal_inline_spec(self): + spec = RunSpec(approach="trace_replay", target=TraceTarget(kind="inline", inline={})) + assert spec.approach == "trace_replay" + assert spec.target.kind == "inline" + assert spec.eval_set is None + assert spec.eval_config == {} + assert spec.sinks == [] + assert spec.context == {} + + def test_extra_fields_allowed_for_forward_compat(self): + """RunSpec uses extra='allow' so a host can include forward-compatible + metadata without breaking older agentevals replicas.""" + spec = RunSpec.model_validate( + { + "approach": "trace_replay", + "target": {"kind": "inline", "inline": {}}, + "futureField": "unknown", + } + ) + assert spec.target.kind == "inline" + + +class TestResultFromMetricResult: + """Locks the renaming + status-mapping behavior between the in-pipeline + MetricResult shape and the persisted Result shape.""" + + def _mr(self, **overrides): + defaults = dict( + metric_name="tool_trajectory_avg_score", + score=0.8, + eval_status="PASSED", + per_invocation_scores=[1.0, 0.6], + error=None, + details={"foo": "bar"}, + duration_ms=42.5, + ) + defaults.update(overrides) + return MetricResult(**defaults) + + def _build(self, mr): + return Result.from_metric_result( + run_id=UUID("00000000-0000-0000-0000-000000000001"), + eval_set_item_id="item-1", + eval_set_item_name="trace-abc", + trace_id="trace-abc", + evaluator_type="builtin", + metric_result=mr, + ) + + def test_passed_maps_to_passed(self): + r = self._build(self._mr(eval_status="PASSED")) + assert r.status == ResultStatus.PASSED + assert r.score == 0.8 + assert r.evaluator_name == "tool_trajectory_avg_score" + assert r.evaluator_type == "builtin" + assert r.eval_set_item_id == "item-1" + assert r.trace_id == "trace-abc" + + def test_failed_maps_to_failed(self): + r = self._build(self._mr(eval_status="FAILED")) + assert r.status == ResultStatus.FAILED + + def test_not_evaluated_maps_to_skipped(self): + r = self._build(self._mr(eval_status="NOT_EVALUATED", score=None, per_invocation_scores=[])) + assert r.status == ResultStatus.SKIPPED + + def test_unknown_status_maps_to_skipped(self): + """Defensive: ADK sometimes emits non-standard status strings; + anything unknown should land as skipped, not crash.""" + r = self._build(self._mr(eval_status="MAYBE_PASSED")) + assert r.status == ResultStatus.SKIPPED + + def test_error_dominates_status(self): + """Even if eval_status says PASSED, a non-empty error means + the row should land as 'errored' so downstream consumers can + filter cleanly without special-casing the error field.""" + r = self._build(self._mr(eval_status="PASSED", error="boom")) + assert r.status == 
ResultStatus.ERRORED + assert r.error_text == "boom" + + def test_duration_ms_renamed_to_latency_ms(self): + r = self._build(self._mr(duration_ms=42.7)) + assert r.latency_ms == 42 # int truncation matches the schema column type + + def test_latency_ms_none_when_duration_missing(self): + r = self._build(self._mr(duration_ms=None)) + assert r.latency_ms is None + + def test_per_invocation_scores_preserved(self): + r = self._build(self._mr(per_invocation_scores=[0.0, 0.5, 1.0])) + assert r.per_invocation_scores == [0.0, 0.5, 1.0] + + def test_details_default_to_empty_dict(self): + r = self._build(self._mr(details=None)) + assert r.details == {} + + def test_result_id_matches_canonical_formula(self): + r = self._build(self._mr()) + expected = compute_result_id( + UUID("00000000-0000-0000-0000-000000000001"), + "item-1", + "tool_trajectory_avg_score", + ) + assert r.result_id == expected + + +class TestRun: + def test_default_status_and_attempt(self): + run = Run( + run_id=UUID("00000000-0000-0000-0000-000000000001"), + status=RunStatus.QUEUED, + spec=RunSpec(approach="trace_replay", target=TraceTarget(kind="inline", inline={})), + ) + assert run.attempt == 0 + assert run.cancel_requested is False + assert run.error is None diff --git a/uv.lock b/uv.lock index 8a573ec..2d42db3 100644 --- a/uv.lock +++ b/uv.lock @@ -53,6 +53,9 @@ live = [ openai = [ { name = "openai" }, ] +postgres = [ + { name = "asyncpg" }, +] streaming = [ { name = "opentelemetry-sdk" }, { name = "websockets" }, @@ -80,6 +83,7 @@ e2e = [ [package.metadata] requires-dist = [ + { name = "asyncpg", marker = "extra == 'postgres'", specifier = ">=0.30.0" }, { name = "click", specifier = ">=8.0" }, { name = "fastapi", specifier = ">=0.115.0" }, { name = "google-adk", extras = ["eval"], specifier = ">=1.30.0" }, @@ -96,7 +100,7 @@ requires-dist = [ { name = "uvicorn", extras = ["standard"], specifier = ">=0.32.0" }, { name = "websockets", marker = "extra == 'streaming'", specifier = ">=12.0" }, ] -provides-extras = ["live", "streaming", "openai"] +provides-extras = ["live", "streaming", "openai", "postgres"] [package.metadata.requires-dev] dev = [ @@ -307,6 +311,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/0e/27be9fdef66e72d64c0cdc3cc2823101b80585f8119b5c112c2e8f5f7dab/anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c", size = 113592, upload-time = "2026-01-06T11:45:19.497Z" }, ] +[[package]] +name = "asyncpg" +version = "0.31.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fe/cc/d18065ce2380d80b1bcce927c24a2642efd38918e33fd724bc4bca904877/asyncpg-0.31.0.tar.gz", hash = "sha256:c989386c83940bfbd787180f2b1519415e2d3d6277a70d9d0f0145ac73500735", size = 993667, upload-time = "2025-11-24T23:27:00.812Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/17/cc02bc49bc350623d050fa139e34ea512cd6e020562f2a7312a7bcae4bc9/asyncpg-0.31.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:eee690960e8ab85063ba93af2ce128c0f52fd655fdff9fdb1a28df01329f031d", size = 643159, upload-time = "2025-11-24T23:25:36.443Z" }, + { url = "https://files.pythonhosted.org/packages/a4/62/4ded7d400a7b651adf06f49ea8f73100cca07c6df012119594d1e3447aa6/asyncpg-0.31.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2657204552b75f8288de08ca60faf4a99a65deef3a71d1467454123205a88fab", size = 638157, upload-time = "2025-11-24T23:25:37.89Z" }, + { url = 
"https://files.pythonhosted.org/packages/d6/5b/4179538a9a72166a0bf60ad783b1ef16efb7960e4d7b9afe9f77a5551680/asyncpg-0.31.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a429e842a3a4b4ea240ea52d7fe3f82d5149853249306f7ff166cb9948faa46c", size = 2918051, upload-time = "2025-11-24T23:25:39.461Z" }, + { url = "https://files.pythonhosted.org/packages/e6/35/c27719ae0536c5b6e61e4701391ffe435ef59539e9360959240d6e47c8c8/asyncpg-0.31.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0807be46c32c963ae40d329b3a686356e417f674c976c07fa49f1b30303f109", size = 2972640, upload-time = "2025-11-24T23:25:41.512Z" }, + { url = "https://files.pythonhosted.org/packages/43/f4/01ebb9207f29e645a64699b9ce0eefeff8e7a33494e1d29bb53736f7766b/asyncpg-0.31.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e5d5098f63beeae93512ee513d4c0c53dc12e9aa2b7a1af5a81cddf93fe4e4da", size = 2851050, upload-time = "2025-11-24T23:25:43.153Z" }, + { url = "https://files.pythonhosted.org/packages/3e/f4/03ff1426acc87be0f4e8d40fa2bff5c3952bef0080062af9efc2212e3be8/asyncpg-0.31.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37fc6c00a814e18eef51833545d1891cac9aa69140598bb076b4cd29b3e010b9", size = 2962574, upload-time = "2025-11-24T23:25:44.942Z" }, + { url = "https://files.pythonhosted.org/packages/c7/39/cc788dfca3d4060f9d93e67be396ceec458dfc429e26139059e58c2c244d/asyncpg-0.31.0-cp311-cp311-win32.whl", hash = "sha256:5a4af56edf82a701aece93190cc4e094d2df7d33f6e915c222fb09efbb5afc24", size = 521076, upload-time = "2025-11-24T23:25:46.486Z" }, + { url = "https://files.pythonhosted.org/packages/28/fc/735af5384c029eb7f1ca60ccb8fa95521dbdaeef788edf4cecfc604c3cab/asyncpg-0.31.0-cp311-cp311-win_amd64.whl", hash = "sha256:480c4befbdf079c14c9ca43c8c5e1fe8b6296c96f1f927158d4f1e750aacc047", size = 584980, upload-time = "2025-11-24T23:25:47.938Z" }, + { url = "https://files.pythonhosted.org/packages/2a/a6/59d0a146e61d20e18db7396583242e32e0f120693b67a8de43f1557033e2/asyncpg-0.31.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b44c31e1efc1c15188ef183f287c728e2046abb1d26af4d20858215d50d91fad", size = 662042, upload-time = "2025-11-24T23:25:49.578Z" }, + { url = "https://files.pythonhosted.org/packages/36/01/ffaa189dcb63a2471720615e60185c3f6327716fdc0fc04334436fbb7c65/asyncpg-0.31.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0c89ccf741c067614c9b5fc7f1fc6f3b61ab05ae4aaa966e6fd6b93097c7d20d", size = 638504, upload-time = "2025-11-24T23:25:51.501Z" }, + { url = "https://files.pythonhosted.org/packages/9f/62/3f699ba45d8bd24c5d65392190d19656d74ff0185f42e19d0bbd973bb371/asyncpg-0.31.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:12b3b2e39dc5470abd5e98c8d3373e4b1d1234d9fbdedf538798b2c13c64460a", size = 3426241, upload-time = "2025-11-24T23:25:53.278Z" }, + { url = "https://files.pythonhosted.org/packages/8c/d1/a867c2150f9c6e7af6462637f613ba67f78a314b00db220cd26ff559d532/asyncpg-0.31.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:aad7a33913fb8bcb5454313377cc330fbb19a0cd5faa7272407d8a0c4257b671", size = 3520321, upload-time = "2025-11-24T23:25:54.982Z" }, + { url = "https://files.pythonhosted.org/packages/7a/1a/cce4c3f246805ecd285a3591222a2611141f1669d002163abef999b60f98/asyncpg-0.31.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3df118d94f46d85b2e434fd62c84cb66d5834d5a890725fe625f498e72e4d5ec", size = 3316685, upload-time = "2025-11-24T23:25:57.43Z" }, + { url = 
"https://files.pythonhosted.org/packages/40/ae/0fc961179e78cc579e138fad6eb580448ecae64908f95b8cb8ee2f241f67/asyncpg-0.31.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bd5b6efff3c17c3202d4b37189969acf8927438a238c6257f66be3c426beba20", size = 3471858, upload-time = "2025-11-24T23:25:59.636Z" }, + { url = "https://files.pythonhosted.org/packages/52/b2/b20e09670be031afa4cbfabd645caece7f85ec62d69c312239de568e058e/asyncpg-0.31.0-cp312-cp312-win32.whl", hash = "sha256:027eaa61361ec735926566f995d959ade4796f6a49d3bde17e5134b9964f9ba8", size = 527852, upload-time = "2025-11-24T23:26:01.084Z" }, + { url = "https://files.pythonhosted.org/packages/b5/f0/f2ed1de154e15b107dc692262395b3c17fc34eafe2a78fc2115931561730/asyncpg-0.31.0-cp312-cp312-win_amd64.whl", hash = "sha256:72d6bdcbc93d608a1158f17932de2321f68b1a967a13e014998db87a72ed3186", size = 597175, upload-time = "2025-11-24T23:26:02.564Z" }, + { url = "https://files.pythonhosted.org/packages/95/11/97b5c2af72a5d0b9bc3fa30cd4b9ce22284a9a943a150fdc768763caf035/asyncpg-0.31.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c204fab1b91e08b0f47e90a75d1b3c62174dab21f670ad6c5d0f243a228f015b", size = 661111, upload-time = "2025-11-24T23:26:04.467Z" }, + { url = "https://files.pythonhosted.org/packages/1b/71/157d611c791a5e2d0423f09f027bd499935f0906e0c2a416ce712ba51ef3/asyncpg-0.31.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:54a64f91839ba59008eccf7aad2e93d6e3de688d796f35803235ea1c4898ae1e", size = 636928, upload-time = "2025-11-24T23:26:05.944Z" }, + { url = "https://files.pythonhosted.org/packages/2e/fc/9e3486fb2bbe69d4a867c0b76d68542650a7ff1574ca40e84c3111bb0c6e/asyncpg-0.31.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c0e0822b1038dc7253b337b0f3f676cadc4ac31b126c5d42691c39691962e403", size = 3424067, upload-time = "2025-11-24T23:26:07.957Z" }, + { url = "https://files.pythonhosted.org/packages/12/c6/8c9d076f73f07f995013c791e018a1cd5f31823c2a3187fc8581706aa00f/asyncpg-0.31.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bef056aa502ee34204c161c72ca1f3c274917596877f825968368b2c33f585f4", size = 3518156, upload-time = "2025-11-24T23:26:09.591Z" }, + { url = "https://files.pythonhosted.org/packages/ae/3b/60683a0baf50fbc546499cfb53132cb6835b92b529a05f6a81471ab60d0c/asyncpg-0.31.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0bfbcc5b7ffcd9b75ab1558f00db2ae07db9c80637ad1b2469c43df79d7a5ae2", size = 3319636, upload-time = "2025-11-24T23:26:11.168Z" }, + { url = "https://files.pythonhosted.org/packages/50/dc/8487df0f69bd398a61e1792b3cba0e47477f214eff085ba0efa7eac9ce87/asyncpg-0.31.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:22bc525ebbdc24d1261ecbf6f504998244d4e3be1721784b5f64664d61fbe602", size = 3472079, upload-time = "2025-11-24T23:26:13.164Z" }, + { url = "https://files.pythonhosted.org/packages/13/a1/c5bbeeb8531c05c89135cb8b28575ac2fac618bcb60119ee9696c3faf71c/asyncpg-0.31.0-cp313-cp313-win32.whl", hash = "sha256:f890de5e1e4f7e14023619399a471ce4b71f5418cd67a51853b9910fdfa73696", size = 527606, upload-time = "2025-11-24T23:26:14.78Z" }, + { url = "https://files.pythonhosted.org/packages/91/66/b25ccb84a246b470eb943b0107c07edcae51804912b824054b3413995a10/asyncpg-0.31.0-cp313-cp313-win_amd64.whl", hash = "sha256:dc5f2fa9916f292e5c5c8b2ac2813763bcd7f58e130055b4ad8a0531314201ab", size = 596569, upload-time = "2025-11-24T23:26:16.189Z" }, + { url = 
"https://files.pythonhosted.org/packages/3c/36/e9450d62e84a13aea6580c83a47a437f26c7ca6fa0f0fd40b6670793ea30/asyncpg-0.31.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:f6b56b91bb0ffc328c4e3ed113136cddd9deefdf5f79ab448598b9772831df44", size = 660867, upload-time = "2025-11-24T23:26:17.631Z" }, + { url = "https://files.pythonhosted.org/packages/82/4b/1d0a2b33b3102d210439338e1beea616a6122267c0df459ff0265cd5807a/asyncpg-0.31.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:334dec28cf20d7f5bb9e45b39546ddf247f8042a690bff9b9573d00086e69cb5", size = 638349, upload-time = "2025-11-24T23:26:19.689Z" }, + { url = "https://files.pythonhosted.org/packages/41/aa/e7f7ac9a7974f08eff9183e392b2d62516f90412686532d27e196c0f0eeb/asyncpg-0.31.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:98cc158c53f46de7bb677fd20c417e264fc02b36d901cc2a43bd6cb0dc6dbfd2", size = 3410428, upload-time = "2025-11-24T23:26:21.275Z" }, + { url = "https://files.pythonhosted.org/packages/6f/de/bf1b60de3dede5c2731e6788617a512bc0ebd9693eac297ee74086f101d7/asyncpg-0.31.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9322b563e2661a52e3cdbc93eed3be7748b289f792e0011cb2720d278b366ce2", size = 3471678, upload-time = "2025-11-24T23:26:23.627Z" }, + { url = "https://files.pythonhosted.org/packages/46/78/fc3ade003e22d8bd53aaf8f75f4be48f0b460fa73738f0391b9c856a9147/asyncpg-0.31.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:19857a358fc811d82227449b7ca40afb46e75b33eb8897240c3839dd8b744218", size = 3313505, upload-time = "2025-11-24T23:26:25.235Z" }, + { url = "https://files.pythonhosted.org/packages/bf/e9/73eb8a6789e927816f4705291be21f2225687bfa97321e40cd23055e903a/asyncpg-0.31.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ba5f8886e850882ff2c2ace5732300e99193823e8107e2c53ef01c1ebfa1e85d", size = 3434744, upload-time = "2025-11-24T23:26:26.944Z" }, + { url = "https://files.pythonhosted.org/packages/08/4b/f10b880534413c65c5b5862f79b8e81553a8f364e5238832ad4c0af71b7f/asyncpg-0.31.0-cp314-cp314-win32.whl", hash = "sha256:cea3a0b2a14f95834cee29432e4ddc399b95700eb1d51bbc5bfee8f31fa07b2b", size = 532251, upload-time = "2025-11-24T23:26:28.404Z" }, + { url = "https://files.pythonhosted.org/packages/d3/2d/7aa40750b7a19efa5d66e67fc06008ca0f27ba1bd082e457ad82f59aba49/asyncpg-0.31.0-cp314-cp314-win_amd64.whl", hash = "sha256:04d19392716af6b029411a0264d92093b6e5e8285ae97a39957b9a9c14ea72be", size = 604901, upload-time = "2025-11-24T23:26:30.34Z" }, + { url = "https://files.pythonhosted.org/packages/ce/fe/b9dfe349b83b9dee28cc42360d2c86b2cdce4cb551a2c2d27e156bcac84d/asyncpg-0.31.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:bdb957706da132e982cc6856bb2f7b740603472b54c3ebc77fe60ea3e57e1bd2", size = 702280, upload-time = "2025-11-24T23:26:32Z" }, + { url = "https://files.pythonhosted.org/packages/6a/81/e6be6e37e560bd91e6c23ea8a6138a04fd057b08cf63d3c5055c98e81c1d/asyncpg-0.31.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6d11b198111a72f47154fa03b85799f9be63701e068b43f84ac25da0bda9cb31", size = 682931, upload-time = "2025-11-24T23:26:33.572Z" }, + { url = "https://files.pythonhosted.org/packages/a6/45/6009040da85a1648dd5bc75b3b0a062081c483e75a1a29041ae63a0bf0dc/asyncpg-0.31.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:18c83b03bc0d1b23e6230f5bf8d4f217dc9bc08644ce0502a9d91dc9e634a9c7", size = 3581608, upload-time = "2025-11-24T23:26:35.638Z" }, + { url = 
"https://files.pythonhosted.org/packages/7e/06/2e3d4d7608b0b2b3adbee0d0bd6a2d29ca0fc4d8a78f8277df04e2d1fd7b/asyncpg-0.31.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e009abc333464ff18b8f6fd146addffd9aaf63e79aa3bb40ab7a4c332d0c5e9e", size = 3498738, upload-time = "2025-11-24T23:26:37.275Z" }, + { url = "https://files.pythonhosted.org/packages/7d/aa/7d75ede780033141c51d83577ea23236ba7d3a23593929b32b49db8ed36e/asyncpg-0.31.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3b1fbcb0e396a5ca435a8826a87e5c2c2cc0c8c68eb6fadf82168056b0e53a8c", size = 3401026, upload-time = "2025-11-24T23:26:39.423Z" }, + { url = "https://files.pythonhosted.org/packages/ba/7a/15e37d45e7f7c94facc1e9148c0e455e8f33c08f0b8a0b1deb2c5171771b/asyncpg-0.31.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8df714dba348efcc162d2adf02d213e5fab1bd9f557e1305633e851a61814a7a", size = 3429426, upload-time = "2025-11-24T23:26:41.032Z" }, + { url = "https://files.pythonhosted.org/packages/13/d5/71437c5f6ae5f307828710efbe62163974e71237d5d46ebd2869ea052d10/asyncpg-0.31.0-cp314-cp314t-win32.whl", hash = "sha256:1b41f1afb1033f2b44f3234993b15096ddc9cd71b21a42dbd87fc6a57b43d65d", size = 614495, upload-time = "2025-11-24T23:26:42.659Z" }, + { url = "https://files.pythonhosted.org/packages/3c/d7/8fb3044eaef08a310acfe23dae9a8e2e07d305edc29a53497e52bc76eca7/asyncpg-0.31.0-cp314-cp314t-win_amd64.whl", hash = "sha256:bd4107bb7cdd0e9e65fae66a62afd3a249663b844fa34d479f6d5b3bef9c04c3", size = 706062, upload-time = "2025-11-24T23:26:44.086Z" }, +] + [[package]] name = "attrs" version = "25.4.0"