From 3d1195b2f859bf78ea1f918d01c5d73f054477de Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Sun, 5 Apr 2026 19:38:07 +0100 Subject: [PATCH 01/13] feat: add database migration for deliver_after column and batches_failed tracking This migration adds support for explicit delivery scheduling and batch failure accounting: - Add deliver_after column to deliveries table for backoff scheduling - Add batches_failed column to ingest_runs table for failure tracking - Create index on deliver_after for efficient pending delivery queries feat: replace IngestStarted with IngestRunStarted and NextBatchRequested events Split the single IngestStarted event into two distinct events: - IngestRunStarted for observability/audit purposes - NextBatchRequested for triggering ingester batch pulls This separation clarifies event semantics and handler responsibilities. feat: add comprehensive error handling with TransientError and PermanentError Introduce structured error classification for runner failures: - TransientError for retryable failures (timeouts, resource pressure) - PermanentError for unrecoverable failures (config errors, image pull) - OOMError as specialized PermanentError for memory exhaustion feat: implement batch failure accounting in ingest service Add methods to handle permanent batch and ingestion failures: - fail_batch() for hook processing failures - fail_ingestion() for ingester pull failures - Update completion logic to account for failed batches feat: enhance worker retry logic with exponential backoff Implement sophisticated retry handling: - Exponential backoff for transient failures - Immediate failure for permanent errors - deliver_after timestamp for deferred retry scheduling - Call handler.on_exhausted() when retries are exhausted refactor: remove HookStatus.FAILED and HookStatus.OOM Replace status-based failure reporting with exception-based error handling: - Hook failures now raise appropriate error types - Simplify validation run status calculation - Remove oom_killed property from HookResult refactor: update K8s and OCI runners to use new error types Convert all runner implementations to raise structured exceptions: - Map container exit codes and OOM kills to appropriate error types - Remove result-based error reporting - Simplify job completion logic feat: add on_exhausted callback to EventHandler Allow handlers to perform cleanup when events cannot be processed: - Called when retry limit is reached or permanent failure occurs - Default implementation is no-op for backward compatibility --- .../migrations/versions/add_deliver_after.py | 45 ++++++ server/osa/domain/ingest/event/__init__.py | 6 +- server/osa/domain/ingest/event/events.py | 17 +- server/osa/domain/ingest/handler/run_hooks.py | 38 ++++- .../osa/domain/ingest/handler/run_ingester.py | 50 ++++-- server/osa/domain/ingest/model/ingest_run.py | 8 +- server/osa/domain/ingest/port/repository.py | 8 + server/osa/domain/ingest/service/ingest.py | 52 ++++++- server/osa/domain/shared/error.py | 12 ++ server/osa/domain/shared/event.py | 11 ++ server/osa/domain/shared/outbox.py | 8 +- .../domain/shared/port/event_repository.py | 2 + server/osa/domain/validation/model/entity.py | 2 - .../domain/validation/model/hook_result.py | 7 - server/osa/domain/validation/service/hook.py | 42 ++--- .../domain/validation/service/validation.py | 9 +- server/osa/infrastructure/event/worker.py | 87 +++++++++-- server/osa/infrastructure/ingest/di.py | 4 +- server/osa/infrastructure/k8s/errors.py | 16 +- 
.../osa/infrastructure/k8s/ingester_runner.py | 147 +++++++----------- server/osa/infrastructure/k8s/runner.py | 145 ++++++----------- .../osa/infrastructure/oci/ingester_runner.py | 30 ++-- server/osa/infrastructure/oci/runner.py | 32 +--- .../persistence/repository/event.py | 32 ++-- .../persistence/repository/ingest.py | 20 +++ .../osa/infrastructure/persistence/tables.py | 9 ++ .../unit/domain/ingest/test_ingest_run.py | 71 +++++++++ .../unit/domain/ingest/test_ingest_service.py | 22 ++- .../unit/domain/shared/test_error_types.py | 45 ++++++ .../unit/domain/shared/test_outbox_claim.py | 2 +- .../shared/test_outbox_consumer_groups.py | 2 +- .../domain/validation/test_hook_result.py | 60 +------ .../domain/validation/test_hook_service.py | 48 +++--- .../validation/test_validation_service.py | 14 +- .../unit/infrastructure/event/test_worker.py | 11 +- .../k8s/test_classify_api_error.py | 26 ++-- .../k8s/test_k8s_hook_runner.py | 129 ++++++++------- .../k8s/test_k8s_ingester_runner.py | 53 ++----- .../infrastructure/test_oci_hook_runner.py | 29 ++-- 39 files changed, 788 insertions(+), 563 deletions(-) create mode 100644 server/migrations/versions/add_deliver_after.py create mode 100644 server/tests/unit/domain/shared/test_error_types.py diff --git a/server/migrations/versions/add_deliver_after.py b/server/migrations/versions/add_deliver_after.py new file mode 100644 index 0000000..c7a7b6b --- /dev/null +++ b/server/migrations/versions/add_deliver_after.py @@ -0,0 +1,45 @@ +"""add_deliver_after_and_batches_failed + +Add deliver_after column to deliveries table for explicit backoff scheduling. +Add batches_failed column to ingest_runs table for batch failure accounting. + +Revision ID: add_deliver_after +Revises: add_ingest_runs +Create Date: 2026-04-04 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = "add_deliver_after" +down_revision: Union[str, Sequence[str], None] = "add_ingest_runs" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column( + "deliveries", + sa.Column("deliver_after", sa.DateTime(timezone=True), nullable=True), + ) + op.create_index( + "idx_deliveries_deliver_after", + "deliveries", + ["deliver_after"], + postgresql_where=sa.text("status = 'pending'"), + ) + + op.add_column( + "ingest_runs", + sa.Column("batches_failed", sa.Integer, nullable=False, server_default=sa.text("0")), + ) + + +def downgrade() -> None: + op.drop_column("ingest_runs", "batches_failed") + op.drop_index("idx_deliveries_deliver_after", table_name="deliveries") + op.drop_column("deliveries", "deliver_after") diff --git a/server/osa/domain/ingest/event/__init__.py b/server/osa/domain/ingest/event/__init__.py index c8b3101..b10432b 100644 --- a/server/osa/domain/ingest/event/__init__.py +++ b/server/osa/domain/ingest/event/__init__.py @@ -4,12 +4,14 @@ HookBatchCompleted, IngestBatchPublished, IngestCompleted, - IngestStarted, + IngestRunStarted, IngesterBatchReady, + NextBatchRequested, ) __all__ = [ - "IngestStarted", + "IngestRunStarted", + "NextBatchRequested", "IngesterBatchReady", "HookBatchCompleted", "IngestBatchPublished", diff --git a/server/osa/domain/ingest/event/events.py b/server/osa/domain/ingest/event/events.py index ee0eaab..7b14e51 100644 --- a/server/osa/domain/ingest/event/events.py +++ b/server/osa/domain/ingest/event/events.py @@ -3,8 +3,21 @@ from osa.domain.shared.event import Event, EventId -class IngestStarted(Event): - """Emitted when an ingest run is created. Triggers first ingester pull.""" +class IngestRunStarted(Event): + """Emitted once when an ingest run is created. Observability/audit only.""" + + id: EventId + ingest_run_srn: str + convention_srn: str + batch_size: int + + +class NextBatchRequested(Event): + """Emitted to trigger the next ingester batch pull. + + Emitted by StartIngest (first batch) and by RunIngester (continuation). + RunIngester is the only handler that listens to this event. 
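+
+    Safe to redeliver: RunIngester checks the remaining record limit
+    and ignores the event once the limit is already met.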
+ """ id: EventId ingest_run_srn: str diff --git a/server/osa/domain/ingest/handler/run_hooks.py b/server/osa/domain/ingest/handler/run_hooks.py index 63eb205..18f82de 100644 --- a/server/osa/domain/ingest/handler/run_hooks.py +++ b/server/osa/domain/ingest/handler/run_hooks.py @@ -8,7 +8,8 @@ from osa.domain.ingest.model.ingester_record import IngesterRecord from osa.domain.ingest.port.repository import IngestRunRepository from osa.domain.ingest.port.storage import IngestStoragePort -from osa.domain.shared.error import NotFoundError +from osa.domain.ingest.service.ingest import IngestService +from osa.domain.shared.error import NotFoundError, PermanentError from osa.domain.shared.event import EventHandler, EventId from osa.domain.shared.model.srn import ConventionSRN from osa.domain.shared.outbox import Outbox @@ -26,6 +27,7 @@ class RunHooks(EventHandler[IngesterBatchReady]): __claim_timeout__ = 3600.0 ingest_repo: IngestRunRepository + ingest_service: IngestService convention_service: ConventionService hook_service: HookService outbox: Outbox @@ -82,11 +84,22 @@ async def handle(self, event: IngesterBatchReady) -> None: ) # Run all hooks via HookService - results = await self.hook_service.run_hooks_for_batch( - hooks=convention.hooks, - inputs=inputs, - work_dirs=work_dirs, - ) + try: + results = await self.hook_service.run_hooks_for_batch( + hooks=convention.hooks, + inputs=inputs, + work_dirs=work_dirs, + ) + except PermanentError as e: + log.error( + "[{short_id}] batch {batch_index} permanently failed: {error}", + short_id=event.ingest_run_srn.rsplit(":", 1)[-1][:8], + batch_index=event.batch_index, + error=str(e), + ingest_run_srn=event.ingest_run_srn, + ) + await self._fail_batch(event) + return short_id = event.ingest_run_srn.rsplit(":", 1)[-1][:8] for result in results: @@ -108,3 +121,16 @@ async def handle(self, event: IngesterBatchReady) -> None: batch_index=event.batch_index, ) ) + + async def on_exhausted(self, event: IngesterBatchReady) -> None: + """Called when transient retries are exhausted — account for the failed batch.""" + log.error( + "batch {batch_index} retries exhausted", + batch_index=event.batch_index, + ingest_run_srn=event.ingest_run_srn, + ) + await self._fail_batch(event) + + async def _fail_batch(self, event: IngesterBatchReady) -> None: + """Account for a permanently failed batch.""" + await self.ingest_service.fail_batch(event.ingest_run_srn) diff --git a/server/osa/domain/ingest/handler/run_ingester.py b/server/osa/domain/ingest/handler/run_ingester.py index 4c3130e..a7048b7 100644 --- a/server/osa/domain/ingest/handler/run_ingester.py +++ b/server/osa/domain/ingest/handler/run_ingester.py @@ -1,13 +1,14 @@ -"""RunIngester — runs ingester container on IngestStarted or continuation.""" +"""RunIngester — runs ingester container on NextBatchRequested.""" from uuid import uuid4 from osa.domain.deposition.service.convention import ConventionService -from osa.domain.ingest.event.events import IngestStarted, IngesterBatchReady +from osa.domain.ingest.event.events import IngesterBatchReady, NextBatchRequested from osa.domain.ingest.model.ingest_run import IngestStatus from osa.domain.ingest.port.repository import IngestRunRepository from osa.domain.ingest.port.storage import IngestStoragePort -from osa.domain.shared.error import NotFoundError +from osa.domain.ingest.service.ingest import IngestService +from osa.domain.shared.error import NotFoundError, PermanentError from osa.domain.shared.event import EventHandler, EventId from osa.domain.shared.model.srn 
import ConventionSRN from osa.domain.shared.outbox import Outbox @@ -17,18 +18,19 @@ log = get_logger(__name__) -class RunIngester(EventHandler[IngestStarted]): +class RunIngester(EventHandler[NextBatchRequested]): """Runs ingester container and emits IngesterBatchReady per batch.""" __claim_timeout__ = 3600.0 ingest_repo: IngestRunRepository + ingest_service: IngestService convention_service: ConventionService ingester_runner: IngesterRunner outbox: Outbox ingest_storage: IngestStoragePort - async def handle(self, event: IngestStarted) -> None: + async def handle(self, event: NextBatchRequested) -> None: ingest_run = await self.ingest_repo.get(event.ingest_run_srn) if ingest_run is None: raise NotFoundError(f"Ingest run not found: {event.ingest_run_srn}") @@ -53,7 +55,7 @@ async def handle(self, event: IngestStarted) -> None: remaining = ingest_run.limit - ingested_so_far if remaining <= 0: log.warn( - "Ignoring redelivered IngestStarted — limit already met (batches_ingested={batches_ingested}, limit={limit})", + "Ignoring redelivered NextBatchRequested — limit already met (batches_ingested={batches_ingested}, limit={limit})", batches_ingested=ingest_run.batches_ingested, limit=ingest_run.limit, ingest_run_srn=event.ingest_run_srn, @@ -74,12 +76,22 @@ async def handle(self, event: IngestStarted) -> None: work_dir = self.ingest_storage.batch_work_dir(event.ingest_run_srn, batch_index) files_dir = self.ingest_storage.batch_files_dir(event.ingest_run_srn, batch_index) - output = await self.ingester_runner.run( - ingester=convention.ingester, - inputs=inputs, - files_dir=files_dir, - work_dir=work_dir, - ) + try: + output = await self.ingester_runner.run( + ingester=convention.ingester, + inputs=inputs, + files_dir=files_dir, + work_dir=work_dir, + ) + except PermanentError as e: + log.error( + "[{short_id}] ingester permanently failed: {error}", + short_id=event.ingest_run_srn.rsplit(":", 1)[-1][:8], + error=str(e), + ingest_run_srn=event.ingest_run_srn, + ) + await self._fail_ingestion(event) + return await self.ingest_storage.write_records(event.ingest_run_srn, batch_index, output.records) @@ -119,10 +131,22 @@ async def handle(self, event: IngestStarted) -> None: if has_more: await self.outbox.append( - IngestStarted( + NextBatchRequested( id=EventId(uuid4()), ingest_run_srn=event.ingest_run_srn, convention_srn=event.convention_srn, batch_size=ingest_run.batch_size, ) ) + + async def on_exhausted(self, event: NextBatchRequested) -> None: + """Transient retries exhausted — stop ingestion and check completion.""" + log.error( + "ingester retries exhausted", + ingest_run_srn=event.ingest_run_srn, + ) + await self._fail_ingestion(event) + + async def _fail_ingestion(self, event: NextBatchRequested) -> None: + """Account for a permanently failed ingester pull.""" + await self.ingest_service.fail_ingestion(event.ingest_run_srn) diff --git a/server/osa/domain/ingest/model/ingest_run.py b/server/osa/domain/ingest/model/ingest_run.py index c95f604..cf72f77 100644 --- a/server/osa/domain/ingest/model/ingest_run.py +++ b/server/osa/domain/ingest/model/ingest_run.py @@ -36,6 +36,7 @@ class IngestRun(Aggregate): batches_ingested: int = 0 batches_completed: int = 0 published_count: int = 0 + batches_failed: int = 0 batch_size: int = 1000 limit: int | None = None # Max total records (None = unlimited) started_at: datetime @@ -71,8 +72,11 @@ def record_batch_completed(self, published_count: int) -> None: @property def is_complete(self) -> bool: - """Check the completion condition: all sourced batches are 
completed.""" - return self.ingestion_finished and self.batches_ingested == self.batches_completed + """Check the completion condition: all sourced batches are accounted for.""" + return ( + self.ingestion_finished + and (self.batches_completed + self.batches_failed) >= self.batches_ingested + ) def check_completion(self, completed_at: datetime) -> bool: """Check completion condition and transition if met. diff --git a/server/osa/domain/ingest/port/repository.py b/server/osa/domain/ingest/port/repository.py index f5839d6..de27dc7 100644 --- a/server/osa/domain/ingest/port/repository.py +++ b/server/osa/domain/ingest/port/repository.py @@ -40,6 +40,14 @@ async def increment_batches_ingested( """ ... + @abstractmethod + async def increment_failed(self, srn: str) -> IngestRun: + """Atomically increment batches_failed. + + Returns the updated IngestRun for completion condition checking. + """ + ... + @abstractmethod async def increment_completed(self, srn: str, published_count: int) -> IngestRun: """Atomically increment batches_completed and published_count. diff --git a/server/osa/domain/ingest/service/ingest.py b/server/osa/domain/ingest/service/ingest.py index 35c338a..eb8ec1b 100644 --- a/server/osa/domain/ingest/service/ingest.py +++ b/server/osa/domain/ingest/service/ingest.py @@ -4,7 +4,7 @@ from uuid import uuid4 from osa.domain.deposition.service.convention import ConventionService -from osa.domain.ingest.event.events import IngestStarted +from osa.domain.ingest.event.events import IngestCompleted, IngestRunStarted, NextBatchRequested from osa.domain.ingest.model.ingest_run import IngestRun, IngestStatus from osa.domain.ingest.port.repository import IngestRunRepository from osa.domain.shared.error import ConflictError, NotFoundError @@ -69,7 +69,16 @@ async def start_ingest( await self.ingest_repo.save(ingest_run) await self.outbox.append( - IngestStarted( + IngestRunStarted( + id=EventId(uuid4()), + ingest_run_srn=srn, + convention_srn=convention_srn, + batch_size=batch_size, + ) + ) + + await self.outbox.append( + NextBatchRequested( id=EventId(uuid4()), ingest_run_srn=srn, convention_srn=convention_srn, @@ -85,3 +94,42 @@ async def start_ingest( limit=limit, ) return ingest_run + + async def fail_batch(self, ingest_run_srn: str) -> None: + """Account for a batch that permanently failed hook processing. + + Increments batches_failed and completes the run if all batches + are now accounted for (completed + failed >= ingested). + """ + ingest_run = await self.ingest_repo.increment_failed(ingest_run_srn) + await self._check_completion(ingest_run) + + async def fail_ingestion(self, ingest_run_srn: str) -> None: + """Account for a failed ingester pull. + + The batch was never sourced, so we mark ingestion as finished + (no more batches coming) and increment batches_failed. The + completion condition can then fire based on whatever batches + were already sourced. 
+ """ + await self.ingest_repo.increment_batches_ingested( + ingest_run_srn, + set_ingestion_finished=True, + ) + ingest_run = await self.ingest_repo.increment_failed(ingest_run_srn) + await self._check_completion(ingest_run) + + async def _check_completion(self, ingest_run: IngestRun) -> None: + """Transition to COMPLETED and emit IngestCompleted if all batches are accounted for.""" + if not ingest_run.is_complete: + return + now = datetime.now(UTC) + ingest_run.check_completion(now) + await self.ingest_repo.save(ingest_run) + await self.outbox.append( + IngestCompleted( + id=EventId(uuid4()), + ingest_run_srn=ingest_run.srn, + total_published=ingest_run.published_count, + ) + ) diff --git a/server/osa/domain/shared/error.py b/server/osa/domain/shared/error.py index ab50e83..b4f334e 100644 --- a/server/osa/domain/shared/error.py +++ b/server/osa/domain/shared/error.py @@ -72,6 +72,18 @@ class ConfigurationError(InfrastructureError): """System misconfiguration detected.""" +class TransientError(InfrastructureError): + """Temporary failure — worker retries with backoff.""" + + +class PermanentError(InfrastructureError): + """Unrecoverable failure — worker gives up.""" + + +class OOMError(PermanentError): + """Container killed by out-of-memory. HookService intercepts for memory retry.""" + + # ============================================================================= # Event Processing Errors # ============================================================================= diff --git a/server/osa/domain/shared/event.py b/server/osa/domain/shared/event.py index 0fc19b3..de5ef7c 100644 --- a/server/osa/domain/shared/event.py +++ b/server/osa/domain/shared/event.py @@ -130,6 +130,7 @@ class Delivery: id: str event: "Event" + retry_count: int = 0 @dataclass(frozen=True) @@ -260,6 +261,16 @@ async def handle_batch(self, events: list[E]) -> None: for event in events: await self.handle(event) + async def on_exhausted(self, event: E) -> None: + """Called when delivery retries are exhausted or failure is permanent. + + Override to perform cleanup or accounting when an event will never + be successfully processed. Default: no-op. + + Args: + event: The event that could not be processed. + """ + # --- Schedule --- diff --git a/server/osa/domain/shared/outbox.py b/server/osa/domain/shared/outbox.py index eb508fc..8ac7d51 100644 --- a/server/osa/domain/shared/outbox.py +++ b/server/osa/domain/shared/outbox.py @@ -1,5 +1,6 @@ """Outbox - domain service for reliable event delivery.""" +from datetime import datetime from typing import TypeVar from osa.domain.shared.event import ClaimResult, Event @@ -80,6 +81,7 @@ async def mark_failed_with_retry( delivery_id: str, error: str, max_retries: int, + deliver_after: datetime | None = None, ) -> None: """Mark a delivery as failed, with retry logic. @@ -90,8 +92,12 @@ async def mark_failed_with_retry( delivery_id: The delivery row ID. error: Error message. max_retries: Maximum retry attempts before marking as failed. + deliver_after: If set, the delivery won't be eligible for claiming + until this timestamp. Used for transient resource backoff. """ - await self._repo.mark_failed_with_retry(delivery_id, error=error, max_retries=max_retries) + await self._repo.mark_failed_with_retry( + delivery_id, error=error, max_retries=max_retries, deliver_after=deliver_after + ) async def reset_stale_claims(self, timeout_seconds: float) -> int: """Reset deliveries that have been claimed for too long. 
diff --git a/server/osa/domain/shared/port/event_repository.py b/server/osa/domain/shared/port/event_repository.py index 8a222d1..d0242cb 100644 --- a/server/osa/domain/shared/port/event_repository.py +++ b/server/osa/domain/shared/port/event_repository.py @@ -1,5 +1,6 @@ """EventRepository port - pure CRUD for event persistence.""" +from datetime import datetime from typing import Protocol, TypeVar from osa.domain.shared.event import ClaimResult, Event, EventId @@ -123,6 +124,7 @@ async def mark_failed_with_retry( delivery_id: str, error: str, max_retries: int, + deliver_after: datetime | None = None, ) -> None: """Mark a delivery as failed with retry logic. diff --git a/server/osa/domain/validation/model/entity.py b/server/osa/domain/validation/model/entity.py index fdd35ca..9436895 100644 --- a/server/osa/domain/validation/model/entity.py +++ b/server/osa/domain/validation/model/entity.py @@ -24,8 +24,6 @@ def summary(self) -> HookStatus | None: if not self.results: return None statuses = [r.status for r in self.results] - if HookStatus.FAILED in statuses: - return HookStatus.FAILED if HookStatus.REJECTED in statuses: return HookStatus.REJECTED return HookStatus.PASSED diff --git a/server/osa/domain/validation/model/hook_result.py b/server/osa/domain/validation/model/hook_result.py index ba4b959..7841d6f 100644 --- a/server/osa/domain/validation/model/hook_result.py +++ b/server/osa/domain/validation/model/hook_result.py @@ -10,8 +10,6 @@ class HookStatus(StrEnum): PASSED = "passed" REJECTED = "rejected" - FAILED = "failed" - OOM = "oom" class ProgressEntry(ValueObject): @@ -31,8 +29,3 @@ class HookResult(ValueObject): error_message: str | None = None progress: list[ProgressEntry] = Field(default_factory=list) duration_seconds: float - - @property - def oom_killed(self) -> bool: - """Whether this hook was killed by an out-of-memory condition.""" - return self.status == HookStatus.OOM diff --git a/server/osa/domain/validation/service/hook.py b/server/osa/domain/validation/service/hook.py index 3c770a4..4718578 100644 --- a/server/osa/domain/validation/service/hook.py +++ b/server/osa/domain/validation/service/hook.py @@ -13,6 +13,7 @@ from collections.abc import Iterable from pathlib import Path +from osa.domain.shared.error import OOMError from osa.domain.shared.model.hook import HookDefinition from osa.domain.shared.service import Service from osa.domain.validation.model.batch_outcome import ( @@ -81,16 +82,15 @@ async def run_hook( config=inputs.config, ) - result = await self.hook_runner.run(current_hook, attempt_inputs, work_dir) - total_duration += result.duration_seconds - - # Read any output written by this attempt - new_outcomes = _read_output_dir(work_dir) - for rid, outcome in new_outcomes.items(): - if rid not in outcomes: - outcomes[rid] = outcome + try: + result = await self.hook_runner.run(current_hook, attempt_inputs, work_dir) + except OOMError: + # Read any partial output written before OOM + new_outcomes = _read_output_dir(work_dir) + for rid, outcome in new_outcomes.items(): + if rid not in outcomes: + outcomes[rid] = outcome - if result.oom_killed: # Checkpoint what we have so far await self.hook_storage.write_checkpoint(work_dir, outcomes) @@ -118,17 +118,19 @@ async def run_hook( error=f"OOM after {MAX_OOM_RETRIES} retries (last limit: {current_hook.runtime.limits.memory})", ) await self.hook_storage.write_batch_outcomes(work_dir, outcomes) - return HookResult( - hook_name=hook.name, - status=HookStatus.OOM, - error_message=f"OOM exhausted after {MAX_OOM_RETRIES} 
retries", - duration_seconds=total_duration, - ) - elif result.status == HookStatus.FAILED: - # Non-OOM failure — no retry - await self.hook_storage.write_batch_outcomes(work_dir, outcomes) - return result - elif result.status == HookStatus.REJECTED: + raise + # Non-OOM exceptions (TransientError, PermanentError, etc.) + # propagate uncaught to the worker layer + + total_duration += result.duration_seconds + + # Read any output written by this attempt + new_outcomes = _read_output_dir(work_dir) + for rid, outcome in new_outcomes.items(): + if rid not in outcomes: + outcomes[rid] = outcome + + if result.status == HookStatus.REJECTED: # Rejection — no retry, propagate status await self.hook_storage.write_batch_outcomes(work_dir, outcomes) return HookResult( diff --git a/server/osa/domain/validation/service/validation.py b/server/osa/domain/validation/service/validation.py index a2dcdee..a675f5b 100644 --- a/server/osa/domain/validation/service/validation.py +++ b/server/osa/domain/validation/service/validation.py @@ -85,12 +85,13 @@ async def run_hooks( for hook in hooks: work_dir = self.hook_storage.get_hook_output_dir(deposition_srn, hook.name) - result = await hook_service.run_hook(hook, inputs, work_dir) - hook_results.append(result) - - if result.status in (HookStatus.FAILED, HookStatus.OOM): + try: + result = await hook_service.run_hook(hook, inputs, work_dir) + except Exception: overall_status = RunStatus.FAILED break + hook_results.append(result) + if result.status == HookStatus.REJECTED: overall_status = RunStatus.REJECTED break diff --git a/server/osa/infrastructure/event/worker.py b/server/osa/infrastructure/event/worker.py index 7721c2a..1b26ba5 100644 --- a/server/osa/infrastructure/event/worker.py +++ b/server/osa/infrastructure/event/worker.py @@ -1,9 +1,9 @@ """Worker and WorkerPool for pull-based event processing.""" import asyncio -import logging from contextlib import AsyncExitStack from dataclasses import dataclass, field +from datetime import UTC, datetime, timedelta from typing import TYPE_CHECKING, Any, NewType if TYPE_CHECKING: @@ -13,7 +13,7 @@ from apscheduler.triggers.cron import CronTrigger from dishka import AsyncContainer from osa.domain.auth.model.identity import Identity, System -from osa.domain.shared.error import SkippedEvents +from osa.domain.shared.error import PermanentError, SkippedEvents, TransientError from osa.domain.shared.event import ( EventHandler, Schedule, @@ -22,9 +22,10 @@ WorkerStatus, ) from osa.domain.shared.outbox import Outbox +from osa.infrastructure.logging import get_logger from osa.util.di.scope import Scope -logger = logging.getLogger(__name__) +logger = get_logger(__name__) @dataclass @@ -136,7 +137,7 @@ async def _run(self) -> None: logger.info(f"Worker '{self.name}' cancelled") raise except Exception as e: - logger.exception(f"Worker '{self.name}' crashed: {e}") + logger.error(f"Worker '{self.name}' crashed: {e}") self._state.error = e raise finally: @@ -171,8 +172,9 @@ async def _poll_once(self) -> bool: self._state.current_batch = result.events self._state.last_claim_at = result.claimed_at + handler = await scope.get(self._handler_type) + try: - handler = await scope.get(self._handler_type) events = result.events if self._batch_size > 1: @@ -187,9 +189,7 @@ async def _poll_once(self) -> bool: self._state.processed_count += len(result.deliveries) except SkippedEvents as e: - logger.warning( - f"Worker '{self.name}' skipping {len(e.event_ids)} events: {e.reason}" - ) + logger.warn(f"Worker '{self.name}' skipping {len(e.event_ids)} 
events: {e.reason}") skipped_set = set(e.event_ids) for delivery in result.deliveries: if delivery.event.id in skipped_set: @@ -198,16 +198,71 @@ async def _poll_once(self) -> bool: await outbox.mark_delivered(delivery.id) self._state.processed_count += len(result.deliveries) - len(e.event_ids) + except TransientError as e: + self._state.failed_count += len(result.deliveries) + self._state.error = e + for delivery in result.deliveries: + exhausted = delivery.retry_count + 1 >= self._max_retries + if exhausted: + logger.warn( + "Worker '{name}' transient retries exhausted: {error}", + name=self.name, + error=str(e), + ) + await handler.on_exhausted(delivery.event) + await outbox.mark_failed(delivery.id, str(e)) + else: + backoff_seconds = min(300, 60 * (2**delivery.retry_count)) + deliver_after = datetime.now(UTC) + timedelta(seconds=backoff_seconds) + logger.warn( + "Worker '{name}' transient failure: {error} " + "(attempt={attempt}, next_retry={next_retry})", + name=self.name, + error=str(e), + attempt=delivery.retry_count, + next_retry=deliver_after.isoformat(), + ) + await outbox.mark_failed_with_retry( + delivery.id, + str(e), + max_retries=self._max_retries, + deliver_after=deliver_after, + ) + + except PermanentError as e: + self._state.failed_count += len(result.deliveries) + self._state.error = e + logger.error( + "Worker '{name}' permanent failure: {error}", + name=self.name, + error=str(e), + ) + for delivery in result.deliveries: + await handler.on_exhausted(delivery.event) + await outbox.mark_failed(delivery.id, str(e)) + except Exception as e: self._state.failed_count += len(result.deliveries) self._state.error = e - logger.error(f"Worker '{self.name}' batch failed: {e}") + logger.error( + "Worker '{name}' batch failed: {error}", + name=self.name, + error=str(e), + ) for delivery in result.deliveries: - await outbox.mark_failed_with_retry( - delivery.id, - str(e), - max_retries=self._max_retries, - ) + exhausted = delivery.retry_count + 1 >= self._max_retries + if exhausted: + await handler.on_exhausted(delivery.event) + await outbox.mark_failed(delivery.id, str(e)) + else: + backoff_seconds = min(30, 5 ** (delivery.retry_count + 1)) + deliver_after = datetime.now(UTC) + timedelta(seconds=backoff_seconds) + await outbox.mark_failed_with_retry( + delivery.id, + str(e), + max_retries=self._max_retries, + deliver_after=deliver_after, + ) finally: self._state.current_batch = [] @@ -406,7 +461,9 @@ async def _run_schedule(self, config: "ScheduleConfig") -> None: self._schedule_failures[config.id] = failures logger.error(f"Failed to run schedule {config.id} (failures: {failures}): {e}") if failures >= 5: - logger.critical(f"Schedule {config.id} has failed {failures} consecutive times") + logger.error( + f"CRITICAL: Schedule {config.id} has failed {failures} consecutive times" + ) async def __aenter__(self) -> "WorkerPool": """Start the pool as async context manager.""" diff --git a/server/osa/infrastructure/ingest/di.py b/server/osa/infrastructure/ingest/di.py index 5992072..fbc4c8f 100644 --- a/server/osa/infrastructure/ingest/di.py +++ b/server/osa/infrastructure/ingest/di.py @@ -50,7 +50,7 @@ def get_ingest_service( # Ingest storage — default (filesystem, for local/Docker) @provide(scope=Scope.APP) def get_ingest_storage(self, layout: StorageLayout) -> IngestStoragePort: - return FilesystemIngestStorage(layout=layout) # type: ignore[return-value] + return FilesystemIngestStorage(layout=layout) # Ingest storage — K8s (S3 via aioboto3, reuses S3Client from RunnerProvider) 
@provide(when=K8S, scope=Scope.APP) @@ -59,7 +59,7 @@ def get_ingest_storage_s3( ) -> IngestStoragePort: from osa.infrastructure.s3.ingest_storage import S3IngestStorage - return S3IngestStorage( # type: ignore[return-value] + return S3IngestStorage( s3=s3, layout=layout, data_mount_path=config.runner.k8s.data_mount_path, diff --git a/server/osa/infrastructure/k8s/errors.py b/server/osa/infrastructure/k8s/errors.py index 3387478..1d74523 100644 --- a/server/osa/infrastructure/k8s/errors.py +++ b/server/osa/infrastructure/k8s/errors.py @@ -3,27 +3,27 @@ Maps kubernetes-asyncio ApiException status codes to OSA error types. """ -from osa.domain.shared.error import ConfigurationError, InfrastructureError, OSAError +from osa.domain.shared.error import OSAError, PermanentError, TransientError def classify_api_error(exc: Exception) -> OSAError: """Classify a K8s API error by HTTP status code. - - 403 → ConfigurationError (RBAC misconfiguration, not retried) - - 404 → ConfigurationError (namespace/resource missing, not retried) - - 500, 503 → InfrastructureError (transient, retried by outbox) - - Other → InfrastructureError + - 403 → PermanentError (RBAC misconfiguration, not retried) + - 404 → PermanentError (namespace/resource missing, not retried) + - 500, 503 → TransientError (cluster pressure, retried with backoff) + - Other → TransientError """ status = getattr(exc, "status", 0) reason = getattr(exc, "reason", str(exc)) if status == 403: - return ConfigurationError( + return PermanentError( f"K8s RBAC permission denied: {reason}. " "Check ServiceAccount permissions for the OSA namespace." ) if status == 404: - return ConfigurationError( + return PermanentError( f"K8s resource not found: {reason}. Check that the namespace and resources exist." ) - return InfrastructureError(f"K8s API error ({status}): {reason}") + return TransientError(f"K8s API error ({status}): {reason}") diff --git a/server/osa/infrastructure/k8s/ingester_runner.py b/server/osa/infrastructure/k8s/ingester_runner.py index fa29ec1..d505272 100644 --- a/server/osa/infrastructure/k8s/ingester_runner.py +++ b/server/osa/infrastructure/k8s/ingester_runner.py @@ -10,7 +10,12 @@ from typing import TYPE_CHECKING from osa.config import K8sConfig -from osa.domain.shared.error import ExternalServiceError, InfrastructureError +from osa.domain.shared.error import ( + InfrastructureError, + OOMError, + PermanentError, + TransientError, +) from osa.domain.shared.model.source import IngesterDefinition from osa.domain.shared.model.srn import ConventionSRN from osa.domain.shared.port.ingester_runner import IngesterInputs, IngesterOutput, IngesterRunner @@ -22,7 +27,7 @@ ) if TYPE_CHECKING: - from kubernetes_asyncio.client import ApiClient, BatchV1Api, CoreV1Api, V1Job + from kubernetes_asyncio.client import ApiClient, V1Job from osa.infrastructure.s3.client import S3Client @@ -40,11 +45,13 @@ class K8sIngesterRunner(IngesterRunner): - Three volume mounts: input (ro), output (rw), files (rw) - Higher resource defaults (3600s, 4g) - Source-specific env vars (OSA_FILES, OSA_SINCE, etc.) 
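+    - Failures raise TransientError/PermanentError/OOMError (see _diagnose_failure)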
- - Errors raise ExternalServiceError (not returned as result values) """ def __init__(self, api_client: ApiClient, config: K8sConfig, s3: S3Client) -> None: - self._api_client = api_client + from kubernetes_asyncio.client import BatchV1Api, CoreV1Api + + self._batch_api = BatchV1Api(api_client) + self._core_api = CoreV1Api(api_client) self._config = config self._s3 = s3 @@ -59,18 +66,6 @@ async def run( files_dir: Path, work_dir: Path, ) -> IngesterOutput: - try: - from kubernetes_asyncio.client import BatchV1Api, CoreV1Api - except ImportError: - from osa.domain.shared.error import ConfigurationError - - raise ConfigurationError( - "kubernetes-asyncio is required for K8s runner. Install with: pip install osa[k8s]" - ) - - batch_api = BatchV1Api(self._api_client) - core_api = CoreV1Api(self._api_client) - # Write input files to S3 (container reads them via PVC/S3 CSI) input_prefix = self._s3_prefix(work_dir, "input") @@ -81,26 +76,14 @@ async def run( if inputs.session: await self._s3.put_object(f"{input_prefix}/session.json", json.dumps(inputs.session)) - return await self._run_job( - batch_api, - core_api, - ingester, - inputs, - work_dir, - files_dir, - convention_srn=inputs.convention_srn, - ) + return await self._run_job(ingester, inputs, work_dir, files_dir) async def _run_job( self, - batch_api: BatchV1Api, - core_api: CoreV1Api, ingester: IngesterDefinition, inputs: IngesterInputs, work_dir: Path, files_dir: Path, - *, - convention_srn: ConventionSRN | None = None, ) -> IngesterOutput: """Core Job lifecycle for ingester execution.""" namespace = self._config.namespace @@ -109,7 +92,7 @@ async def _run_job( try: # Check for existing Jobs existing = await self._check_existing_job( - batch_api, namespace, convention_srn, ingester.digest + namespace, inputs.convention_srn, ingester.digest ) if existing == "succeeded": @@ -123,11 +106,11 @@ async def _run_job( work_dir=work_dir, files_dir=files_dir, inputs=inputs, - convention_srn=convention_srn, + convention_srn=inputs.convention_srn, ) job_name_to_watch = spec.metadata.name - await batch_api.create_namespaced_job(namespace, spec) + await self._batch_api.create_namespaced_job(namespace, spec) logger.info( "Created K8s ingester Job", extra={ @@ -138,36 +121,29 @@ async def _run_job( ) # Phase 1: Scheduling - await self._wait_for_scheduling(core_api, job_name_to_watch, namespace) + await self._wait_for_scheduling(job_name_to_watch, namespace) - # Phase 2: Completion - result = await self._wait_for_completion( - batch_api, + # Phase 2: Completion (raises on failure) + await self._wait_for_completion( job_name_to_watch, namespace, timeout_seconds=ingester.limits.timeout_seconds + 30, ) - if result == "succeeded": - output = await self._parse_source_output(work_dir, files_dir) - logger.info( - "Source completed", - extra={ - "job_name": job_name_to_watch, - "record_count": len(output.records), - "has_session": output.session is not None, - }, - ) - return output - - # Failed — diagnose and raise - await self._diagnose_and_raise(core_api, job_name_to_watch, namespace, ingester, result) - # unreachable but satisfies type checker - raise ExternalServiceError("Source failed") + output = await self._parse_source_output(work_dir, files_dir) + logger.info( + "Source completed", + extra={ + "job_name": job_name_to_watch, + "record_count": len(output.records), + "has_session": output.session is not None, + }, + ) + return output finally: if job_name_to_watch: - await self._cleanup_job(batch_api, job_name_to_watch, namespace) + await 
self._cleanup_job(job_name_to_watch, namespace) async def _parse_source_output(self, work_dir: Path, files_dir: Path) -> IngesterOutput: from osa.infrastructure.runner_utils import ( @@ -182,7 +158,6 @@ async def _parse_source_output(self, work_dir: Path, files_dir: Path) -> Ingeste async def _check_existing_job( self, - batch_api: BatchV1Api, namespace: str, convention_srn: ConventionSRN | None, digest: str = "", @@ -195,7 +170,9 @@ async def _check_existing_job( label_selector = ",".join(label_parts) try: - job_list = await batch_api.list_namespaced_job(namespace, label_selector=label_selector) + job_list = await self._batch_api.list_namespaced_job( + namespace, label_selector=label_selector + ) except Exception as exc: raise classify_api_error(exc) from exc @@ -334,7 +311,6 @@ def _relative_path(self, path: Path) -> str: async def _wait_for_scheduling( self, - core_api: CoreV1Api, job_name: str, namespace: str, *, @@ -346,7 +322,7 @@ async def _wait_for_scheduling( while time.monotonic() < deadline: try: - pod_list = await core_api.list_namespaced_pod( + pod_list = await self._core_api.list_namespaced_pod( namespace, label_selector=label_selector ) except Exception as exc: @@ -356,13 +332,13 @@ async def _wait_for_scheduling( phase = pod.status.phase if phase == "Failed": reason = getattr(pod.status, "reason", None) or "Unknown" - raise InfrastructureError(f"Pod failed during scheduling: {reason}") + raise TransientError(f"Pod failed during scheduling: {reason}") if phase == "Pending" and pod.status.container_statuses: for cs in pod.status.container_statuses: waiting = getattr(cs.state, "waiting", None) if waiting and waiting.reason in ("ImagePullBackOff", "ErrImagePull"): - raise InfrastructureError( + raise PermanentError( f"Image pull failed: {waiting.reason}: {getattr(waiting, 'message', '')}" ) @@ -371,87 +347,82 @@ async def _wait_for_scheduling( await asyncio.sleep(poll_interval) - raise InfrastructureError( - f"Pod scheduling timeout after {timeout_seconds}s for Job {job_name}" - ) + raise TransientError(f"Pod scheduling timeout after {timeout_seconds}s for Job {job_name}") async def _wait_for_completion( self, - batch_api: BatchV1Api, job_name: str, namespace: str, *, timeout_seconds: float = 3630, poll_interval: float = 5.0, - ) -> str: + ) -> None: + """Wait for Job to complete. 
Returns on success, raises on failure.""" deadline = time.monotonic() + timeout_seconds while time.monotonic() < deadline: try: - job = await batch_api.read_namespaced_job(job_name, namespace) + job = await self._batch_api.read_namespaced_job(job_name, namespace) except Exception as exc: raise classify_api_error(exc) from exc if job.status.succeeded: - return "succeeded" + return if job.status.conditions: for condition in job.status.conditions: if condition.type == "Failed" and condition.status == "True": - return f"failed:{getattr(condition, 'reason', 'Unknown')}" + failure_reason = getattr(condition, "reason", "Unknown") + raise await self._diagnose_failure(job_name, namespace, failure_reason) if condition.type == "Complete" and condition.status == "True": - return "succeeded" + return if job.status.failed: - return "failed:BackoffLimitExceeded" + raise await self._diagnose_failure(job_name, namespace, "BackoffLimitExceeded") await asyncio.sleep(poll_interval) # Timed out — poll once more to catch last-millisecond completions try: - job = await batch_api.read_namespaced_job(job_name, namespace) + job = await self._batch_api.read_namespaced_job(job_name, namespace) if job.status.succeeded: - return "succeeded" + return except Exception: pass - return "failed:WatchTimeout" + raise TransientError(f"Watch timeout waiting for ingester Job {job_name} completion") - async def _diagnose_and_raise( + async def _diagnose_failure( self, - core_api: CoreV1Api, job_name: str, namespace: str, - ingester: IngesterDefinition, failure_info: str, - ) -> None: - """Determine failure reason and raise appropriate error.""" + ) -> InfrastructureError: + """Inspect pod status and return the appropriate exception.""" if "DeadlineExceeded" in failure_info: - raise ExternalServiceError( - f"Ingester timed out after {ingester.limits.timeout_seconds}s" - ) + return TransientError("Ingester timed out (deadline exceeded)") try: label_selector = f"job-name={job_name}" - pod_list = await core_api.list_namespaced_pod(namespace, label_selector=label_selector) + pod_list = await self._core_api.list_namespaced_pod( + namespace, label_selector=label_selector + ) for pod in pod_list.items: if pod.status.container_statuses: for cs in pod.status.container_statuses: terminated = getattr(cs.state, "terminated", None) if terminated: if getattr(terminated, "reason", None) == "OOMKilled": - raise ExternalServiceError("Source killed by OOM") + return OOMError("Source killed by OOM") exit_code = getattr(terminated, "exit_code", -1) if exit_code != 0: - raise ExternalServiceError(f"Source exited with code {exit_code}") - except ExternalServiceError: - raise + return TransientError(f"Source exited with code {exit_code}") except Exception: pass - raise ExternalServiceError(f"Source failed: {failure_info}") + return PermanentError(f"Source failed: {failure_info}") - async def _cleanup_job(self, batch_api: BatchV1Api, job_name: str, namespace: str) -> None: + async def _cleanup_job(self, job_name: str, namespace: str) -> None: try: - await batch_api.delete_namespaced_job( + await self._batch_api.delete_namespaced_job( job_name, namespace, propagation_policy="Background", diff --git a/server/osa/infrastructure/k8s/runner.py b/server/osa/infrastructure/k8s/runner.py index 6d5ca8a..46ff230 100644 --- a/server/osa/infrastructure/k8s/runner.py +++ b/server/osa/infrastructure/k8s/runner.py @@ -10,7 +10,12 @@ from typing import TYPE_CHECKING from osa.config import K8sConfig -from osa.domain.shared.error import InfrastructureError +from 
osa.domain.shared.error import ( + InfrastructureError, + OOMError, + PermanentError, + TransientError, +) from osa.domain.shared.model.hook import HookDefinition from osa.domain.validation.model.hook_result import HookResult, HookStatus from osa.domain.validation.port.hook_runner import HookInputs, HookRunner @@ -23,7 +28,7 @@ ) if TYPE_CHECKING: - from kubernetes_asyncio.client import ApiClient, BatchV1Api, CoreV1Api, V1Job + from kubernetes_asyncio.client import ApiClient, V1Job from osa.infrastructure.s3.client import S3Client @@ -43,7 +48,10 @@ class K8sHookRunner(HookRunner): """ def __init__(self, api_client: ApiClient, config: K8sConfig, s3: S3Client) -> None: - self._api_client = api_client + from kubernetes_asyncio.client import BatchV1Api, CoreV1Api + + self._batch_api = BatchV1Api(api_client) + self._core_api = CoreV1Api(api_client) self._config = config self._s3 = s3 @@ -57,18 +65,6 @@ async def run( inputs: HookInputs, work_dir: Path, ) -> HookResult: - try: - from kubernetes_asyncio.client import BatchV1Api, CoreV1Api - except ImportError: - from osa.domain.shared.error import ConfigurationError - - raise ConfigurationError( - "kubernetes-asyncio is required for K8s runner. Install with: pip install osa[k8s]" - ) - - batch_api = BatchV1Api(self._api_client) - core_api = CoreV1Api(self._api_client) - # Write input files to S3 (container reads them via PVC/S3 CSI) input_prefix = self._s3_prefix(work_dir, "input") # Write records.jsonl (unified batch contract) @@ -78,18 +74,10 @@ async def run( config = {**hook.runtime.config, **(inputs.config or {})} await self._s3.put_object(f"{input_prefix}/config.json", json.dumps(config)) - return await self._run_job( - batch_api, - core_api, - hook, - inputs, - work_dir, - ) + return await self._run_job(hook, inputs, work_dir) async def _run_job( self, - batch_api: BatchV1Api, - core_api: CoreV1Api, hook: HookDefinition, inputs: HookInputs, work_dir: Path, @@ -102,9 +90,7 @@ async def _run_job( job_name_to_watch = None try: - existing = await self._check_existing_job( - batch_api, namespace, hook.name, inputs.run_id - ) + existing = await self._check_existing_job(namespace, hook.name, inputs.run_id) if existing == "succeeded": # Read output from completed Job @@ -129,7 +115,7 @@ async def _run_job( ) job_name_to_watch = spec.metadata.name - await batch_api.create_namespaced_job(namespace, spec) + await self._batch_api.create_namespaced_job(namespace, spec) logger.info( "Created K8s Job", extra={ @@ -142,27 +128,20 @@ async def _run_job( ) # Phase 1: Wait for scheduling - await self._wait_for_scheduling(core_api, job_name_to_watch, namespace) + await self._wait_for_scheduling(job_name_to_watch, namespace) - # Phase 2: Wait for completion - result = await self._wait_for_completion( - batch_api, + # Phase 2: Wait for completion (raises on failure) + await self._wait_for_completion( job_name_to_watch, namespace, timeout_seconds=hook.runtime.limits.timeout_seconds + 30, ) - if result == "succeeded": - return await self._parse_hook_result(hook, work_dir, start_time) - - # Job failed — determine why - return await self._diagnose_failure( - core_api, job_name_to_watch, namespace, hook, start_time, result - ) + return await self._parse_hook_result(hook, work_dir, start_time) finally: if job_name_to_watch: - await self._cleanup_job(batch_api, job_name_to_watch, namespace) + await self._cleanup_job(job_name_to_watch, namespace) async def _parse_hook_result( self, hook: HookDefinition, work_dir: Path, start_time: float @@ -193,7 +172,6 @@ async def 
_parse_hook_result( async def _check_existing_job( self, - batch_api: BatchV1Api, namespace: str, hook_name: str, run_id: str, @@ -207,7 +185,9 @@ async def _check_existing_job( """ label_selector = f"osa.io/hook={hook_name},osa.io/run-id={label_value(run_id)}" try: - job_list = await batch_api.list_namespaced_job(namespace, label_selector=label_selector) + job_list = await self._batch_api.list_namespaced_job( + namespace, label_selector=label_selector + ) except Exception as exc: raise classify_api_error(exc) from exc @@ -355,7 +335,6 @@ def _relative_path(self, path: Path) -> str: async def _wait_for_scheduling( self, - core_api: CoreV1Api, job_name: str, namespace: str, *, @@ -368,7 +347,7 @@ async def _wait_for_scheduling( while time.monotonic() < deadline: try: - pod_list = await core_api.list_namespaced_pod( + pod_list = await self._core_api.list_namespaced_pod( namespace, label_selector=label_selector ) except Exception as exc: @@ -380,7 +359,7 @@ async def _wait_for_scheduling( # Check for eviction if phase == "Failed": reason = getattr(pod.status, "reason", None) or "Unknown" - raise InfrastructureError(f"Pod evicted or failed during scheduling: {reason}") + raise TransientError(f"Pod evicted or failed during scheduling: {reason}") # Check for image pull errors if phase == "Pending" and pod.status.container_statuses: @@ -388,126 +367,96 @@ async def _wait_for_scheduling( waiting = getattr(cs.state, "waiting", None) if waiting and waiting.reason in ("ImagePullBackOff", "ErrImagePull"): message = getattr(waiting, "message", "") - raise InfrastructureError( - f"Image pull failed: {waiting.reason}: {message}" - ) + raise PermanentError(f"Image pull failed: {waiting.reason}: {message}") if phase in ("Running", "Succeeded", "Failed"): return # Pod scheduled await asyncio.sleep(poll_interval) - raise InfrastructureError( - f"Pod scheduling timeout after {timeout_seconds}s for Job {job_name}" - ) + raise TransientError(f"Pod scheduling timeout after {timeout_seconds}s for Job {job_name}") async def _wait_for_completion( self, - batch_api: BatchV1Api, job_name: str, namespace: str, *, timeout_seconds: float = 330, poll_interval: float = 5.0, - ) -> str: - """Wait for Job to complete (Phase 2). Returns 'succeeded' or 'failed'.""" + ) -> None: + """Wait for Job to complete (Phase 2). 
Returns on success, raises on failure.""" deadline = time.monotonic() + timeout_seconds while time.monotonic() < deadline: try: - job = await batch_api.read_namespaced_job(job_name, namespace) + job = await self._batch_api.read_namespaced_job(job_name, namespace) except Exception as exc: raise classify_api_error(exc) from exc if job.status.succeeded: - return "succeeded" + return if job.status.conditions: for condition in job.status.conditions: if condition.type == "Failed" and condition.status == "True": - return f"failed:{getattr(condition, 'reason', 'Unknown')}" + failure_reason = getattr(condition, "reason", "Unknown") + raise await self._diagnose_failure(job_name, namespace, failure_reason) if condition.type == "Complete" and condition.status == "True": - return "succeeded" + return if job.status.failed: - return "failed:BackoffLimitExceeded" + raise await self._diagnose_failure(job_name, namespace, "BackoffLimitExceeded") await asyncio.sleep(poll_interval) # Timed out — poll once more try: - job = await batch_api.read_namespaced_job(job_name, namespace) + job = await self._batch_api.read_namespaced_job(job_name, namespace) if job.status.succeeded: - return "succeeded" + return except Exception: pass - return "failed:WatchTimeout" + raise TransientError(f"Watch timeout waiting for Job {job_name} completion") async def _diagnose_failure( self, - core_api: CoreV1Api, job_name: str, namespace: str, - hook: HookDefinition, - start_time: float, failure_info: str, - ) -> HookResult: - """Determine failure reason from pod status.""" - duration = time.monotonic() - start_time - - # Check if DeadlineExceeded + ) -> InfrastructureError: + """Inspect pod status and return the appropriate exception.""" if "DeadlineExceeded" in failure_info: - return HookResult( - hook_name=hook.name, - status=HookStatus.FAILED, - error_message="Hook timed out (deadline exceeded)", - duration_seconds=duration, - ) + return TransientError("Hook timed out (deadline exceeded)") - # Check pod for OOM or exit code try: label_selector = f"job-name={job_name}" - pod_list = await core_api.list_namespaced_pod(namespace, label_selector=label_selector) + pod_list = await self._core_api.list_namespaced_pod( + namespace, label_selector=label_selector + ) for pod in pod_list.items: if pod.status.container_statuses: for cs in pod.status.container_statuses: terminated = getattr(cs.state, "terminated", None) if terminated: if getattr(terminated, "reason", None) == "OOMKilled": - return HookResult( - hook_name=hook.name, - status=HookStatus.OOM, - error_message="Hook killed by OOM", - duration_seconds=duration, - ) + return OOMError("Hook killed by OOM") exit_code = getattr(terminated, "exit_code", -1) if exit_code != 0: - return HookResult( - hook_name=hook.name, - status=HookStatus.FAILED, - error_message=f"Hook exited with code {exit_code}", - duration_seconds=duration, - ) + return PermanentError(f"Hook exited with code {exit_code}") except Exception: pass - return HookResult( - hook_name=hook.name, - status=HookStatus.FAILED, - error_message=f"Hook failed: {failure_info}", - duration_seconds=duration, - ) + return PermanentError(f"Hook failed: {failure_info}") async def _cleanup_job( self, - batch_api: BatchV1Api, job_name: str, namespace: str, ) -> None: """Delete a Job and its pods. 
Ignores 404 (already cleaned up).""" try: - await batch_api.delete_namespaced_job( + await self._batch_api.delete_namespaced_job( job_name, namespace, propagation_policy="Background", diff --git a/server/osa/infrastructure/oci/ingester_runner.py b/server/osa/infrastructure/oci/ingester_runner.py index bb926ee..3527cf9 100644 --- a/server/osa/infrastructure/oci/ingester_runner.py +++ b/server/osa/infrastructure/oci/ingester_runner.py @@ -8,9 +8,8 @@ from pathlib import Path import aiodocker -import logfire - -from osa.domain.shared.error import ExternalServiceError +from osa.domain.shared.error import OOMError, TransientError +from osa.infrastructure.logging import get_logger from osa.domain.shared.model.source import IngesterDefinition from osa.domain.shared.port.ingester_runner import IngesterInputs, IngesterOutput, IngesterRunner from osa.infrastructure.runner_utils import ( @@ -20,6 +19,9 @@ ) +log = get_logger(__name__) + + class OciIngesterRunner(IngesterRunner): """Executes ingesters in OCI containers via aiodocker. @@ -92,13 +94,13 @@ async def _resolve_and_run(): return result except asyncio.TimeoutError: duration = time.monotonic() - start_time - logfire.error( - "Ingester timed out", + log.error( + "Ingester timed out after {timeout}s", image=ingester.image, timeout=timeout, duration=duration, ) - raise ExternalServiceError(f"Ingester timed out after {timeout}s") + raise TransientError(f"Ingester timed out after {timeout}s") finally: rmtree(staging_dir, onexc=_force_remove) @@ -159,22 +161,26 @@ async def _run_container( oom_killed = inspect_data.get("State", {}).get("OOMKilled", False) if oom_killed: - raise ExternalServiceError("Ingester killed by OOM") + raise OOMError("Ingester killed by OOM") if exit_code != 0: logs = await container.log(stdout=True, stderr=True) logs_str = "".join(logs) if logs else "" - raise ExternalServiceError( - f"Ingester exited with code {exit_code}: {logs_str[:500]}" + log.error( + "Ingester exited with code {exit_code}", + exit_code=exit_code, + image=ingester.image, + container_logs=logs_str[:2000], ) + raise TransientError(f"Ingester exited with code {exit_code}") records = parse_records_file(output_dir) session = parse_session_file(output_dir) return IngesterOutput(records=records, session=session, files_dir=files_dir) except aiodocker.DockerError as e: - logfire.error("Docker error running ingester", error=str(e)) - raise ExternalServiceError(f"Docker error: {e}") from e + log.error("Docker error running ingester: {error}", error=str(e)) + raise TransientError(f"Docker error: {e}") from e finally: if container is not None: try: @@ -212,6 +218,6 @@ async def _resolve_image(self, image: str, digest: str) -> str: pass # Pull from registry as last resort - logfire.info("Pulling ingester image", image=image) + log.info("Pulling ingester image: {image}", image=image) await self._docker.images.pull(image) return image diff --git a/server/osa/infrastructure/oci/runner.py b/server/osa/infrastructure/oci/runner.py index 05414d3..88738dd 100644 --- a/server/osa/infrastructure/oci/runner.py +++ b/server/osa/infrastructure/oci/runner.py @@ -10,6 +10,7 @@ from shutil import rmtree import aiodocker +from osa.domain.shared.error import OOMError, PermanentError, TransientError from osa.domain.shared.model.hook import HookDefinition from osa.domain.validation.model.hook_result import HookResult, HookStatus from osa.domain.validation.port.hook_runner import HookInputs, HookRunner @@ -99,19 +100,13 @@ async def _resolve_and_run(): duration_seconds=result_duration, 
) except asyncio.TimeoutError: - duration = time.monotonic() - start_time log.error( "Hook timed out", hook=hook.name, run_id=inputs.run_id, timeout=timeout, ) - return HookResult( - hook_name=hook.name, - status=HookStatus.FAILED, - error_message=f"Hook timed out after {timeout}s", - duration_seconds=duration, - ) + raise TransientError(f"Hook timed out after {timeout}s") finally: rmtree(staging_dir, onexc=_force_remove) @@ -191,10 +186,7 @@ async def _run_container( if tail_text: for line in tail_text.splitlines(): print(f" OOM [{hook.name}] {line}", file=sys.stderr, flush=True) - return { - "status": HookStatus.OOM, - "error_message": f"Hook killed by OOM (limit: {hook.runtime.limits.memory})", - } + raise OOMError(f"Hook killed by OOM (limit: {hook.runtime.limits.memory})") # Parse progress file progress = parse_progress_file(output_dir) @@ -211,29 +203,21 @@ async def _run_container( if exit_code != 0: logs = await container.log(stdout=True, stderr=True) logs_str = "".join(logs) if logs else "" - return { - "status": HookStatus.FAILED, - "error_message": f"Hook exited with code {exit_code}: {logs_str[:2000]}", - "progress": progress, - } + raise PermanentError(f"Hook exited with code {exit_code}: {logs_str[:2000]}") return { "status": HookStatus.PASSED, "progress": progress, } + except (OOMError, PermanentError): + raise except aiodocker.DockerError as e: log.error("Docker error running hook", error=str(e)) - return { - "status": HookStatus.FAILED, - "error_message": f"Docker error: {e}", - } + raise TransientError(f"Docker error: {e}") from e except Exception as e: log.error("Unexpected error running hook", error=str(e)) - return { - "status": HookStatus.FAILED, - "error_message": f"Unexpected error: {e}", - } + raise TransientError(f"Unexpected error: {e}") from e finally: if container is not None: try: diff --git a/server/osa/infrastructure/persistence/repository/event.py b/server/osa/infrastructure/persistence/repository/event.py index 3e09df5..e3c86e6 100644 --- a/server/osa/infrastructure/persistence/repository/event.py +++ b/server/osa/infrastructure/persistence/repository/event.py @@ -6,8 +6,6 @@ from uuid import uuid4 from sqlalchemy import CursorResult, func, insert, or_, select, update -from sqlalchemy.dialects.postgresql import INTERVAL -from sqlalchemy.sql import literal from sqlalchemy.ext.asyncio import AsyncSession from osa.domain.shared.error import InfrastructureError @@ -186,21 +184,17 @@ async def claim_delivery( """ now = datetime.now(UTC) - # Backoff formula: min(30, 5^retry_count) seconds - backoff_seconds = func.least( - literal(30), - func.power(literal(5), deliveries_table.c.retry_count), - ) - backoff_interval = func.cast(func.concat(backoff_seconds, literal(" seconds")), INTERVAL) - backoff_eligible = or_( - deliveries_table.c.retry_count == 0, - deliveries_table.c.updated_at <= func.now() - backoff_interval, + # deliver_after: NULL means immediately eligible, otherwise wait + deliver_after_eligible = or_( + deliveries_table.c.deliver_after.is_(None), + deliveries_table.c.deliver_after <= func.now(), ) # Select deliveries joined with events stmt = ( select( deliveries_table.c.id, + deliveries_table.c.retry_count, events_table.c.event_type, events_table.c.payload, ) @@ -209,7 +203,7 @@ async def claim_delivery( deliveries_table.c.consumer_group == consumer_group, deliveries_table.c.status == "pending", events_table.c.event_type.in_(event_types), - backoff_eligible, + deliver_after_eligible, ) .order_by(events_table.c.created_at.asc()) .limit(limit) @@ 
-234,10 +228,10 @@ async def claim_delivery( # Deserialize events and wrap in Delivery envelopes deliveries: list[Delivery] = [] for row in rows: - delivery_id, event_type, payload = row + delivery_id, retry_count, event_type, payload = row event = self._deserialize(event_type, payload) if event is not None: - deliveries.append(Delivery(id=delivery_id, event=event)) + deliveries.append(Delivery(id=delivery_id, event=event, retry_count=retry_count)) return ClaimResult(deliveries=deliveries, claimed_at=now) @@ -291,8 +285,14 @@ async def mark_failed_with_retry( delivery_id: str, error: str, max_retries: int, + deliver_after: datetime | None = None, ) -> None: - """Mark a delivery as failed with retry logic.""" + """Mark a delivery as failed with retry logic. + + Args: + deliver_after: If set, the delivery won't be eligible for claiming + until this timestamp. Used for transient resource backoff. + """ now = datetime.now(UTC) # Get current retry_count @@ -318,6 +318,7 @@ async def mark_failed_with_retry( status="failed", delivery_error=error, retry_count=new_retry_count, + deliver_after=None, updated_at=now, delivered_at=now, ) @@ -331,6 +332,7 @@ async def mark_failed_with_retry( status="pending", delivery_error=error, retry_count=new_retry_count, + deliver_after=deliver_after, claimed_at=None, updated_at=now, ) diff --git a/server/osa/infrastructure/persistence/repository/ingest.py b/server/osa/infrastructure/persistence/repository/ingest.py index 94d9beb..3c813e6 100644 --- a/server/osa/infrastructure/persistence/repository/ingest.py +++ b/server/osa/infrastructure/persistence/repository/ingest.py @@ -29,6 +29,7 @@ async def save(self, ingest_run: IngestRun) -> None: "batches_ingested": ingest_run.batches_ingested, "batches_completed": ingest_run.batches_completed, "published_count": ingest_run.published_count, + "batches_failed": ingest_run.batches_failed, "batch_size": ingest_run.batch_size, "record_limit": ingest_run.limit, "started_at": ingest_run.started_at, @@ -91,6 +92,24 @@ async def increment_batches_ingested( raise NotFoundError(f"Ingest run not found: {srn}") return _row_to_ingest_run(dict(row)) + async def increment_failed(self, srn: str) -> IngestRun: + """Atomically increment batches_failed.""" + t = ingest_runs_table + stmt = ( + update(t) + .where(t.c.srn == srn) + .values(batches_failed=t.c.batches_failed + 1) + .returning(*t.c) + ) + result = await self._session.execute(stmt) + await self._session.flush() + row = result.mappings().first() + if row is None: + from osa.domain.shared.error import NotFoundError + + raise NotFoundError(f"Ingest run not found: {srn}") + return _row_to_ingest_run(dict(row)) + async def increment_completed(self, srn: str, published_count: int) -> IngestRun: """Atomically increment batches_completed and published_count.""" t = ingest_runs_table @@ -122,6 +141,7 @@ def _row_to_ingest_run(row: dict) -> IngestRun: batches_ingested=row["batches_ingested"], batches_completed=row["batches_completed"], published_count=row["published_count"], + batches_failed=row.get("batches_failed", 0), batch_size=row["batch_size"], limit=row.get("record_limit"), started_at=row["started_at"], diff --git a/server/osa/infrastructure/persistence/tables.py b/server/osa/infrastructure/persistence/tables.py index 4f3545a..7e80d63 100644 --- a/server/osa/infrastructure/persistence/tables.py +++ b/server/osa/infrastructure/persistence/tables.py @@ -120,6 +120,7 @@ Column("delivered_at", DateTime(timezone=True), nullable=True), Column("delivery_error", Text, nullable=True), 
Column("retry_count", Integer, nullable=False, server_default=text("0")), + Column("deliver_after", DateTime(timezone=True), nullable=True), Column("updated_at", DateTime(timezone=True), nullable=False), UniqueConstraint("event_id", "consumer_group", name="uq_delivery_event_consumer"), ) @@ -133,6 +134,13 @@ postgresql_where=text("status IN ('pending', 'claimed')"), ) +# Deferred delivery filtering +Index( + "idx_deliveries_deliver_after", + deliveries_table.c.deliver_after, + postgresql_where=text("status = 'pending'"), +) + # For joining back to events Index("idx_deliveries_event", deliveries_table.c.event_id) @@ -319,6 +327,7 @@ Column("published_count", Integer, nullable=False, server_default=text("0")), Column("batch_size", Integer, nullable=False, server_default=text("1000")), Column("record_limit", Integer, nullable=True), + Column("batches_failed", Integer, nullable=False, server_default=text("0")), Column("started_at", DateTime(timezone=True), nullable=False), Column("completed_at", DateTime(timezone=True), nullable=True), ) diff --git a/server/tests/unit/domain/ingest/test_ingest_run.py b/server/tests/unit/domain/ingest/test_ingest_run.py index 55f5395..7086d8c 100644 --- a/server/tests/unit/domain/ingest/test_ingest_run.py +++ b/server/tests/unit/domain/ingest/test_ingest_run.py @@ -142,3 +142,74 @@ def test_batch_size_default(self) -> None: def test_custom_batch_size(self) -> None: run = _make_run(batch_size=500) assert run.batch_size == 500 + + +class TestBatchFailureAccounting: + def test_batches_failed_defaults_to_zero(self) -> None: + run = _make_run() + assert run.batches_failed == 0 + + def test_complete_with_all_batches_succeeded(self) -> None: + run = _make_run( + status=IngestStatus.RUNNING, + ingestion_finished=True, + batches_ingested=3, + batches_completed=3, + batches_failed=0, + ) + assert run.is_complete + + def test_complete_with_some_batches_failed(self) -> None: + """A run completes when all batches are accounted for, even if some failed.""" + run = _make_run( + status=IngestStatus.RUNNING, + ingestion_finished=True, + batches_ingested=3, + batches_completed=2, + batches_failed=1, + ) + assert run.is_complete + + def test_complete_with_all_batches_failed(self) -> None: + run = _make_run( + status=IngestStatus.RUNNING, + ingestion_finished=True, + batches_ingested=3, + batches_completed=0, + batches_failed=3, + ) + assert run.is_complete + + def test_not_complete_when_batches_still_pending(self) -> None: + run = _make_run( + status=IngestStatus.RUNNING, + ingestion_finished=True, + batches_ingested=3, + batches_completed=1, + batches_failed=1, + ) + assert not run.is_complete + + def test_not_complete_when_ingestion_not_finished(self) -> None: + run = _make_run( + status=IngestStatus.RUNNING, + ingestion_finished=False, + batches_ingested=3, + batches_completed=0, + batches_failed=3, + ) + assert not run.is_complete + + def test_check_completion_transitions_with_failures(self) -> None: + run = _make_run( + status=IngestStatus.RUNNING, + ingestion_finished=True, + batches_ingested=3, + batches_completed=2, + batches_failed=1, + ) + now = datetime.now(UTC) + completed = run.check_completion(now) + assert completed is True + assert run.status == IngestStatus.COMPLETED + assert run.completed_at == now diff --git a/server/tests/unit/domain/ingest/test_ingest_service.py b/server/tests/unit/domain/ingest/test_ingest_service.py index 4742ec6..1bb9b0f 100644 --- a/server/tests/unit/domain/ingest/test_ingest_service.py +++ 
b/server/tests/unit/domain/ingest/test_ingest_service.py @@ -63,19 +63,25 @@ async def test_creates_pending_ingest(self) -> None: assert run.batch_size == 1000 @pytest.mark.asyncio - async def test_saves_and_emits_event(self) -> None: + async def test_saves_and_emits_events(self) -> None: service = _make_service() run = await service.start_ingest( convention_srn="urn:osa:localhost:conv:test-conv@1.0.0", ) service.ingest_repo.save.assert_called_once() - service.outbox.append.assert_called_once() - - # Verify the event is IngestStarted - event = service.outbox.append.call_args[0][0] - assert event.__class__.__name__ == "IngestStarted" - assert event.ingest_run_srn == run.srn - assert event.convention_srn == run.convention_srn + assert service.outbox.append.call_count == 2 + + # First event: IngestRunStarted (observability) + first_event = service.outbox.append.call_args_list[0][0][0] + assert first_event.__class__.__name__ == "IngestRunStarted" + assert first_event.ingest_run_srn == run.srn + assert first_event.convention_srn == run.convention_srn + + # Second event: NextBatchRequested (triggers first batch) + second_event = service.outbox.append.call_args_list[1][0][0] + assert second_event.__class__.__name__ == "NextBatchRequested" + assert second_event.ingest_run_srn == run.srn + assert second_event.convention_srn == run.convention_srn @pytest.mark.asyncio async def test_custom_batch_size(self) -> None: diff --git a/server/tests/unit/domain/shared/test_error_types.py b/server/tests/unit/domain/shared/test_error_types.py new file mode 100644 index 0000000..05aca9f --- /dev/null +++ b/server/tests/unit/domain/shared/test_error_types.py @@ -0,0 +1,45 @@ +"""Tests for runner-specific error types.""" + +from osa.domain.shared.error import ( + InfrastructureError, + OOMError, + PermanentError, + TransientError, +) + + +class TestOOMError: + def test_is_infrastructure_error(self): + err = OOMError("Hook killed by OOM") + assert isinstance(err, InfrastructureError) + + def test_is_permanent_error(self): + err = OOMError("Hook killed by OOM") + assert isinstance(err, PermanentError) + + def test_message_and_code(self): + err = OOMError("Hook killed by OOM") + assert err.message == "Hook killed by OOM" + assert err.code == "OOMError" + + +class TestTransientError: + def test_is_infrastructure_error(self): + err = TransientError("Pod scheduling timeout") + assert isinstance(err, InfrastructureError) + + def test_message_and_code(self): + err = TransientError("Pod scheduling timeout after 120s") + assert err.message == "Pod scheduling timeout after 120s" + assert err.code == "TransientError" + + +class TestPermanentError: + def test_is_infrastructure_error(self): + err = PermanentError("Image pull failed") + assert isinstance(err, InfrastructureError) + + def test_message_and_code(self): + err = PermanentError("Image pull failed: ImagePullBackOff") + assert err.message == "Image pull failed: ImagePullBackOff" + assert err.code == "PermanentError" diff --git a/server/tests/unit/domain/shared/test_outbox_claim.py b/server/tests/unit/domain/shared/test_outbox_claim.py index 85b90c7..7326bb2 100644 --- a/server/tests/unit/domain/shared/test_outbox_claim.py +++ b/server/tests/unit/domain/shared/test_outbox_claim.py @@ -147,5 +147,5 @@ async def test_mark_failed_with_retry_delegates_to_repo( await outbox.mark_failed_with_retry(delivery_id, error, max_retries=3) mock_repo.mark_failed_with_retry.assert_called_once_with( - delivery_id, error=error, max_retries=3 + delivery_id, error=error, max_retries=3, 
deliver_after=None ) diff --git a/server/tests/unit/domain/shared/test_outbox_consumer_groups.py b/server/tests/unit/domain/shared/test_outbox_consumer_groups.py index 9964494..5958ff1 100644 --- a/server/tests/unit/domain/shared/test_outbox_consumer_groups.py +++ b/server/tests/unit/domain/shared/test_outbox_consumer_groups.py @@ -186,7 +186,7 @@ async def test_mark_failed_with_retry_uses_delivery_id( await outbox.mark_failed_with_retry(delivery_id, error="Timeout", max_retries=3) mock_repo.mark_failed_with_retry.assert_called_once_with( - delivery_id, error="Timeout", max_retries=3 + delivery_id, error="Timeout", max_retries=3, deliver_after=None ) diff --git a/server/tests/unit/domain/validation/test_hook_result.py b/server/tests/unit/domain/validation/test_hook_result.py index 8465ff5..d245e74 100644 --- a/server/tests/unit/domain/validation/test_hook_result.py +++ b/server/tests/unit/domain/validation/test_hook_result.py @@ -6,7 +6,14 @@ def test_hook_status_values(): assert HookStatus.PASSED == "passed" assert HookStatus.REJECTED == "rejected" - assert HookStatus.FAILED == "failed" + + +def test_hook_status_only_business_outcomes(): + """HookStatus should only carry business outcomes, not failure modes.""" + from osa.domain.validation.model.hook_result import HookStatus + + members = set(HookStatus) + assert members == {HookStatus.PASSED, HookStatus.REJECTED} def test_progress_entry_full(): @@ -61,19 +68,6 @@ def test_hook_result_rejected(): assert result.rejection_reason == "Missing coordinates" -def test_hook_result_failed(): - from osa.domain.validation.model.hook_result import HookResult, HookStatus - - result = HookResult( - hook_name="detect_pockets", - status=HookStatus.FAILED, - error_message="OOM killed", - duration_seconds=300.0, - ) - assert result.status == HookStatus.FAILED - assert result.error_message == "OOM killed" - - def test_hook_result_with_progress(): from osa.domain.validation.model.hook_result import ( HookResult, @@ -105,44 +99,6 @@ def test_hook_result_default_progress_empty(): assert result.progress == [] -def test_hook_status_oom_value(): - from osa.domain.validation.model.hook_result import HookStatus - - assert HookStatus.OOM == "oom" - assert HookStatus.OOM.value == "oom" - - -def test_hook_result_oom_killed_true(): - from osa.domain.validation.model.hook_result import HookResult, HookStatus - - result = HookResult( - hook_name="detect_pockets", - status=HookStatus.OOM, - error_message="Hook killed by OOM (limit: 1g)", - duration_seconds=30.0, - ) - assert result.oom_killed is True - - -def test_hook_result_oom_killed_false(): - from osa.domain.validation.model.hook_result import HookResult, HookStatus - - result = HookResult( - hook_name="detect_pockets", - status=HookStatus.FAILED, - error_message="Some other error", - duration_seconds=10.0, - ) - assert result.oom_killed is False - - passed = HookResult( - hook_name="detect_pockets", - status=HookStatus.PASSED, - duration_seconds=5.0, - ) - assert passed.oom_killed is False - - def test_hook_result_serialization_roundtrip(): from osa.domain.validation.model.hook_result import ( HookResult, diff --git a/server/tests/unit/domain/validation/test_hook_service.py b/server/tests/unit/domain/validation/test_hook_service.py index a24b741..7b1b4b9 100644 --- a/server/tests/unit/domain/validation/test_hook_service.py +++ b/server/tests/unit/domain/validation/test_hook_service.py @@ -19,6 +19,7 @@ OutcomeStatus, ) from osa.domain.validation.model.hook_input import HookRecord +from osa.domain.shared.error import 
OOMError from osa.domain.validation.model.hook_result import HookResult, HookStatus from osa.domain.validation.port.hook_runner import HookInputs @@ -50,22 +51,8 @@ def _passed_result(hook_name: str = "detect_pockets", duration: float = 5.0) -> return HookResult(hook_name=hook_name, status=HookStatus.PASSED, duration_seconds=duration) -def _oom_result(hook_name: str = "detect_pockets", duration: float = 30.0) -> HookResult: - return HookResult( - hook_name=hook_name, - status=HookStatus.OOM, - error_message="Hook killed by OOM", - duration_seconds=duration, - ) - - -def _failed_result(hook_name: str = "detect_pockets", duration: float = 10.0) -> HookResult: - return HookResult( - hook_name=hook_name, - status=HookStatus.FAILED, - error_message="Some error", - duration_seconds=duration, - ) +def _oom_error() -> OOMError: + return OOMError("Hook killed by OOM") class FakeHookStorage: @@ -164,7 +151,7 @@ async def mock_run(h, inputs, wd): features_file.write_text( json.dumps({"id": records[0].id, "features": [{"score": 0.5}]}) + "\n" ) - return _oom_result() + raise _oom_error() else: # Second call: succeed with remaining features_file = output_dir / "features.jsonl" @@ -188,7 +175,7 @@ async def mock_run(h, inputs, wd): class TestHookServiceOOMExhaustion: - """T017: OOM exhaustion marks remaining records as errored.""" + """T017: OOM exhaustion marks remaining records as errored and re-raises.""" @pytest.mark.asyncio async def test_oom_exhaustion_marks_errored(self, tmp_path: Path): @@ -202,13 +189,13 @@ async def test_oom_exhaustion_marks_errored(self, tmp_path: Path): output_dir.mkdir(parents=True) runner = AsyncMock() - runner.run.return_value = _oom_result() + runner.run.side_effect = _oom_error() storage = FakeHookStorage() service = HookService(hook_runner=runner, hook_storage=storage) - result = await service.run_hook(hook, _inputs(records), work_dir) + with pytest.raises(OOMError): + await service.run_hook(hook, _inputs(records), work_dir) - assert result.status == HookStatus.OOM # Should have retried MAX_OOM_RETRIES times assert runner.run.call_count == 4 # 1 initial + 3 retries @@ -221,10 +208,11 @@ async def test_oom_exhaustion_marks_errored(self, tmp_path: Path): class TestHookServiceNonOOMFailure: - """T018: Non-OOM failure does NOT trigger retry.""" + """T018: Non-OOM failure propagates without retry.""" @pytest.mark.asyncio async def test_non_oom_failure_no_retry(self, tmp_path: Path): + from osa.domain.shared.error import PermanentError from osa.domain.validation.service.hook import HookService hook = _make_hook() @@ -234,13 +222,13 @@ async def test_non_oom_failure_no_retry(self, tmp_path: Path): (work_dir / "output").mkdir(parents=True) runner = AsyncMock() - runner.run.return_value = _failed_result() + runner.run.side_effect = PermanentError("Hook exited with code 1") storage = FakeHookStorage() service = HookService(hook_runner=runner, hook_storage=storage) - result = await service.run_hook(hook, _inputs(records), work_dir) + with pytest.raises(PermanentError): + await service.run_hook(hook, _inputs(records), work_dir) - assert result.status == HookStatus.FAILED runner.run.assert_called_once() @@ -338,7 +326,7 @@ async def side_effect(h, inputs, wd): if h.name == "hook_one": return _passed_result(hook_name="hook_one") else: - return _oom_result(hook_name="hook_two") + raise _oom_error() runner.run.side_effect = side_effect @@ -348,13 +336,13 @@ async def side_effect(h, inputs, wd): r1 = await service.run_hook(hook1, _inputs(records), work_dir1) assert r1.status == 
HookStatus.PASSED - # Run hook 2 — should OOM and exhaust retries - r2 = await service.run_hook(hook2, _inputs(records), work_dir2) - assert r2.status == HookStatus.OOM + # Run hook 2 — should OOM and exhaust retries, then raise + with pytest.raises(OOMError): + await service.run_hook(hook2, _inputs(records), work_dir2) # Hook 1 was called once, hook 2 was called 4 times (1 + 3 retries) hook1_calls = [c for c in runner.run.call_args_list if c[0][0].name == "hook_one"] - hook2_calls = [c for c in runner.run.call_args_list if c[0][0].name == "hook_two"] + hook2_calls = [c for c in runner.run.call_args_list if c[0][0].name != "hook_one"] assert len(hook1_calls) == 1 assert len(hook2_calls) == 4 diff --git a/server/tests/unit/domain/validation/test_validation_service.py b/server/tests/unit/domain/validation/test_validation_service.py index 2589b7d..2b73881 100644 --- a/server/tests/unit/domain/validation/test_validation_service.py +++ b/server/tests/unit/domain/validation/test_validation_service.py @@ -13,6 +13,7 @@ ) from osa.domain.shared.model.srn import DepositionSRN, Domain from osa.domain.validation.model import RunStatus +from osa.domain.shared.error import OOMError, PermanentError from osa.domain.validation.model.hook_result import HookResult, HookStatus from osa.domain.validation.model.hook_input import HookRecord from osa.domain.validation.port.hook_runner import HookInputs @@ -134,9 +135,7 @@ async def test_hook_rejected_halts_pipeline(self): @pytest.mark.asyncio async def test_hook_failed_halts_pipeline(self): hook_runner = AsyncMock() - hook_runner.run.return_value = _make_hook_result( - status=HookStatus.FAILED, - ) + hook_runner.run.side_effect = PermanentError("Hook exited with code 1") service = _make_service(hook_runner=hook_runner) run = await service.create_run(inputs=_make_inputs()) @@ -204,7 +203,7 @@ async def run_hook(hook, inputs, output_dir): async def test_validation_service_halts_on_oom(self): """REGRESSION: OOM with exhausted retries should halt pipeline as FAILED.""" hook_runner = AsyncMock() - hook_runner.run.return_value = _make_hook_result(status=HookStatus.OOM) + hook_runner.run.side_effect = OOMError("Hook killed by OOM") service = _make_service(hook_runner=hook_runner) run = await service.create_run(inputs=_make_inputs()) @@ -226,12 +225,7 @@ async def run_hook(hook, inputs, output_dir): nonlocal call_count call_count += 1 if call_count == 1: - return HookResult( - hook_name=hook.name, - status=HookStatus.OOM, - error_message="OOM", - duration_seconds=30.0, - ) + raise OOMError("Hook killed by OOM") return HookResult( hook_name=hook.name, status=HookStatus.PASSED, diff --git a/server/tests/unit/infrastructure/event/test_worker.py b/server/tests/unit/infrastructure/event/test_worker.py index 4723ab0..84aaac3 100644 --- a/server/tests/unit/infrastructure/event/test_worker.py +++ b/server/tests/unit/infrastructure/event/test_worker.py @@ -358,9 +358,12 @@ async def test_worker_handles_handler_error(self): # Act - Run one poll cycle await worker._poll_once() - # Assert - Event should be marked as failed using delivery_id - outbox.mark_failed_with_retry.assert_called_once_with( - delivery_id, "Processing failed", max_retries=3 - ) + # Assert - Event should be marked as failed using delivery_id with backoff + outbox.mark_failed_with_retry.assert_called_once() + call_args = outbox.mark_failed_with_retry.call_args + assert call_args[0][0] == delivery_id + assert call_args[0][1] == "Processing failed" + assert call_args[1]["max_retries"] == 3 + assert 
call_args[1]["deliver_after"] is not None assert worker.state.failed_count == 1 assert worker.state.error is not None diff --git a/server/tests/unit/infrastructure/k8s/test_classify_api_error.py b/server/tests/unit/infrastructure/k8s/test_classify_api_error.py index c9d3096..64beba5 100644 --- a/server/tests/unit/infrastructure/k8s/test_classify_api_error.py +++ b/server/tests/unit/infrastructure/k8s/test_classify_api_error.py @@ -1,6 +1,6 @@ """Tests for K8s API error classification.""" -from osa.domain.shared.error import ConfigurationError, InfrastructureError +from osa.domain.shared.error import PermanentError, TransientError from osa.infrastructure.k8s.errors import classify_api_error @@ -14,33 +14,33 @@ def __init__(self, status: int, reason: str = ""): class TestClassifyApiError: - def test_403_returns_configuration_error(self): + def test_403_returns_permanent_runner_error(self): exc = _FakeApiException(403, "Forbidden") result = classify_api_error(exc) - assert isinstance(result, ConfigurationError) + assert isinstance(result, PermanentError) assert "RBAC" in result.message or "permission" in result.message.lower() - def test_404_returns_configuration_error(self): + def test_404_returns_permanent_runner_error(self): exc = _FakeApiException(404, "Not Found") result = classify_api_error(exc) - assert isinstance(result, ConfigurationError) + assert isinstance(result, PermanentError) - def test_500_returns_infrastructure_error(self): + def test_500_returns_transient_resource_error(self): exc = _FakeApiException(500, "Internal Server Error") result = classify_api_error(exc) - assert isinstance(result, InfrastructureError) + assert isinstance(result, TransientError) - def test_503_returns_infrastructure_error(self): + def test_503_returns_transient_resource_error(self): exc = _FakeApiException(503, "Service Unavailable") result = classify_api_error(exc) - assert isinstance(result, InfrastructureError) + assert isinstance(result, TransientError) - def test_409_returns_infrastructure_error(self): + def test_409_returns_transient_resource_error(self): exc = _FakeApiException(409, "Conflict") result = classify_api_error(exc) - assert isinstance(result, InfrastructureError) + assert isinstance(result, TransientError) - def test_unknown_status_returns_infrastructure_error(self): + def test_unknown_status_returns_transient_resource_error(self): exc = _FakeApiException(429, "Too Many Requests") result = classify_api_error(exc) - assert isinstance(result, InfrastructureError) + assert isinstance(result, TransientError) diff --git a/server/tests/unit/infrastructure/k8s/test_k8s_hook_runner.py b/server/tests/unit/infrastructure/k8s/test_k8s_hook_runner.py index 9e63c87..d03b972 100644 --- a/server/tests/unit/infrastructure/k8s/test_k8s_hook_runner.py +++ b/server/tests/unit/infrastructure/k8s/test_k8s_hook_runner.py @@ -6,7 +6,11 @@ import pytest from osa.config import K8sConfig -from osa.domain.shared.error import InfrastructureError +from osa.domain.shared.error import ( + OOMError, + PermanentError, + TransientError, +) from osa.domain.shared.model.hook import ( ColumnDef, HookDefinition, @@ -70,7 +74,10 @@ def _make_s3_mock() -> AsyncMock: def _make_runner(config: K8sConfig | None = None) -> K8sHookRunner: api_client = MagicMock() s3 = _make_s3_mock() - return K8sHookRunner(api_client=api_client, config=config or _make_config(), s3=s3) + runner = K8sHookRunner(api_client=api_client, config=config or _make_config(), s3=s3) + runner._batch_api = AsyncMock() + runner._core_api = AsyncMock() + 
return runner # --------------------------------------------------------------------------- @@ -331,6 +338,7 @@ class TestSchedulingWatch: async def test_pod_leaves_pending_quickly(self): runner = _make_runner() core_api = AsyncMock() + runner._core_api = core_api # Pod transitions from Pending to Running pod = MagicMock() @@ -340,12 +348,13 @@ async def test_pod_leaves_pending_quickly(self): pod_list.items = [pod] core_api.list_namespaced_pod.return_value = pod_list - await runner._wait_for_scheduling(core_api, "test-job", "osa") + await runner._wait_for_scheduling("test-job", "osa") @pytest.mark.asyncio async def test_pod_stuck_scheduling_timeout(self): runner = _make_runner() core_api = AsyncMock() + runner._core_api = core_api # Pod stays in Pending pod = MagicMock() @@ -355,15 +364,16 @@ async def test_pod_stuck_scheduling_timeout(self): pod_list.items = [pod] core_api.list_namespaced_pod.return_value = pod_list - with pytest.raises(InfrastructureError, match="scheduling"): + with pytest.raises(TransientError, match="scheduling"): await runner._wait_for_scheduling( - core_api, "test-job", "osa", timeout_seconds=0.1, poll_interval=0.05 + "test-job", "osa", timeout_seconds=0.1, poll_interval=0.05 ) @pytest.mark.asyncio async def test_image_pull_backoff_fails_fast(self): runner = _make_runner() core_api = AsyncMock() + runner._core_api = core_api pod = MagicMock() pod.status.phase = "Pending" @@ -375,13 +385,14 @@ async def test_image_pull_backoff_fails_fast(self): pod_list.items = [pod] core_api.list_namespaced_pod.return_value = pod_list - with pytest.raises(InfrastructureError, match="[Ii]mage pull"): - await runner._wait_for_scheduling(core_api, "test-job", "osa") + with pytest.raises(PermanentError, match="[Ii]mage pull"): + await runner._wait_for_scheduling("test-job", "osa") @pytest.mark.asyncio async def test_err_image_pull_fails_fast(self): runner = _make_runner() core_api = AsyncMock() + runner._core_api = core_api pod = MagicMock() pod.status.phase = "Pending" @@ -393,13 +404,14 @@ async def test_err_image_pull_fails_fast(self): pod_list.items = [pod] core_api.list_namespaced_pod.return_value = pod_list - with pytest.raises(InfrastructureError, match="[Ii]mage pull"): - await runner._wait_for_scheduling(core_api, "test-job", "osa") + with pytest.raises(PermanentError, match="[Ii]mage pull"): + await runner._wait_for_scheduling("test-job", "osa") @pytest.mark.asyncio async def test_pod_evicted(self): runner = _make_runner() core_api = AsyncMock() + runner._core_api = core_api pod = MagicMock() pod.status.phase = "Failed" @@ -409,8 +421,8 @@ async def test_pod_evicted(self): pod_list.items = [pod] core_api.list_namespaced_pod.return_value = pod_list - with pytest.raises(InfrastructureError, match="[Ee]vict"): - await runner._wait_for_scheduling(core_api, "test-job", "osa") + with pytest.raises(TransientError, match="[Ee]vict"): + await runner._wait_for_scheduling("test-job", "osa") # --------------------------------------------------------------------------- @@ -427,6 +439,8 @@ async def test_successful_run(self, tmp_path: Path): batch_api = AsyncMock() core_api = AsyncMock() + runner._batch_api = batch_api + runner._core_api = core_api # No existing jobs (orphan check) job_list = MagicMock() @@ -462,13 +476,7 @@ async def test_successful_run(self, tmp_path: Path): ) inputs = HookInputs(records=[HookRecord(id="test", metadata={})], run_id=_RUN_ID) - result = await runner._run_job( - batch_api, - core_api, - hook, - inputs, - work_dir, - ) + result = await runner._run_job(hook, 
inputs, work_dir) assert result.status == HookStatus.PASSED assert len(result.progress) == 1 @@ -482,6 +490,8 @@ async def test_timeout_deadline_exceeded(self, tmp_path: Path): batch_api = AsyncMock() core_api = AsyncMock() + runner._batch_api = batch_api + runner._core_api = core_api job_list = MagicMock() job_list.items = [] @@ -512,19 +522,12 @@ async def test_timeout_deadline_exceeded(self, tmp_path: Path): work_dir.mkdir(parents=True) inputs = HookInputs(records=[HookRecord(id="test", metadata={})], run_id=_RUN_ID) - result = await runner._run_job( - batch_api, - core_api, - hook, - inputs, - work_dir, - ) - - assert result.status == HookStatus.FAILED - assert ( - "timed out" in result.error_message.lower() - or "deadline" in result.error_message.lower() - ) + with pytest.raises(TransientError, match="[Tt]imed out|[Dd]eadline"): + await runner._run_job( + hook, + inputs, + work_dir, + ) batch_api.delete_namespaced_job.assert_called_once() @pytest.mark.asyncio @@ -534,6 +537,8 @@ async def test_oom_exit_137(self, tmp_path: Path): batch_api = AsyncMock() core_api = AsyncMock() + runner._batch_api = batch_api + runner._core_api = core_api job_list = MagicMock() job_list.items = [] @@ -577,16 +582,12 @@ async def test_oom_exit_137(self, tmp_path: Path): work_dir.mkdir(parents=True) inputs = HookInputs(records=[HookRecord(id="test", metadata={})], run_id=_RUN_ID) - result = await runner._run_job( - batch_api, - core_api, - hook, - inputs, - work_dir, - ) - - assert result.status == HookStatus.OOM - assert "oom" in result.error_message.lower() + with pytest.raises(OOMError, match="[Oo][Oo][Mm]"): + await runner._run_job( + hook, + inputs, + work_dir, + ) @pytest.mark.asyncio async def test_nonzero_exit(self, tmp_path: Path): @@ -595,6 +596,8 @@ async def test_nonzero_exit(self, tmp_path: Path): batch_api = AsyncMock() core_api = AsyncMock() + runner._batch_api = batch_api + runner._core_api = core_api job_list = MagicMock() job_list.items = [] @@ -636,16 +639,12 @@ async def test_nonzero_exit(self, tmp_path: Path): work_dir.mkdir(parents=True) inputs = HookInputs(records=[HookRecord(id="test", metadata={})], run_id=_RUN_ID) - result = await runner._run_job( - batch_api, - core_api, - hook, - inputs, - work_dir, - ) - - assert result.status == HookStatus.FAILED - assert "exit" in result.error_message.lower() + with pytest.raises(PermanentError, match="[Ee]xit"): + await runner._run_job( + hook, + inputs, + work_dir, + ) @pytest.mark.asyncio async def test_orphan_running_job_attaches(self, tmp_path: Path): @@ -655,6 +654,8 @@ async def test_orphan_running_job_attaches(self, tmp_path: Path): batch_api = AsyncMock() core_api = AsyncMock() + runner._batch_api = batch_api + runner._core_api = core_api # Existing active job existing_job = MagicMock() @@ -691,8 +692,6 @@ async def test_orphan_running_job_attaches(self, tmp_path: Path): inputs = HookInputs(records=[HookRecord(id="test", metadata={})], run_id=_RUN_ID) result = await runner._run_job( - batch_api, - core_api, hook, inputs, work_dir, @@ -710,6 +709,8 @@ async def test_orphan_completed_job_reads_output(self, tmp_path: Path): batch_api = AsyncMock() core_api = AsyncMock() + runner._batch_api = batch_api + runner._core_api = core_api existing_job = MagicMock() existing_job.metadata.name = "osa-hook-existing" @@ -727,8 +728,6 @@ async def test_orphan_completed_job_reads_output(self, tmp_path: Path): inputs = HookInputs(records=[HookRecord(id="test", metadata={})], run_id=_RUN_ID) result = await runner._run_job( - batch_api, - core_api, 
hook, inputs, work_dir, @@ -745,6 +744,8 @@ async def test_orphan_failed_job_creates_new(self, tmp_path: Path): batch_api = AsyncMock() core_api = AsyncMock() + runner._batch_api = batch_api + runner._core_api = core_api existing_job = MagicMock() existing_job.metadata.name = "osa-hook-existing" @@ -782,8 +783,6 @@ async def test_orphan_failed_job_creates_new(self, tmp_path: Path): inputs = HookInputs(records=[HookRecord(id="test", metadata={})], run_id=_RUN_ID) result = await runner._run_job( - batch_api, - core_api, hook, inputs, work_dir, @@ -807,7 +806,7 @@ class FakeNotFound(Exception): batch_api.delete_namespaced_job.side_effect = FakeNotFound() # Should not raise - await runner._cleanup_job(batch_api, "test-job", "osa") + await runner._cleanup_job("test-job", "osa") @pytest.mark.asyncio async def test_rejection_via_progress(self, tmp_path: Path): @@ -817,6 +816,8 @@ async def test_rejection_via_progress(self, tmp_path: Path): batch_api = AsyncMock() core_api = AsyncMock() + runner._batch_api = batch_api + runner._core_api = core_api job_list = MagicMock() job_list.items = [] @@ -847,8 +848,6 @@ async def test_rejection_via_progress(self, tmp_path: Path): inputs = HookInputs(records=[HookRecord(id="test", metadata={})], run_id=_RUN_ID) result = await runner._run_job( - batch_api, - core_api, hook, inputs, work_dir, @@ -869,13 +868,13 @@ class TestRunIdFromInputs: @pytest.mark.asyncio async def test_run_uses_run_id_from_inputs(self, tmp_path: Path): """The run_id in Job labels comes from inputs, not the work_dir path.""" - from unittest.mock import patch - config = _make_config(data_mount_path=str(tmp_path)) runner = K8sHookRunner(api_client=MagicMock(), config=config, s3=_make_s3_mock()) batch_api = AsyncMock() core_api = AsyncMock() + runner._batch_api = batch_api + runner._core_api = core_api # No existing jobs job_list = MagicMock() @@ -909,11 +908,7 @@ async def test_run_uses_run_id_from_inputs(self, tmp_path: Path): run_id="my-real-run-id", ) - with ( - patch("kubernetes_asyncio.client.BatchV1Api", return_value=batch_api), - patch("kubernetes_asyncio.client.CoreV1Api", return_value=core_api), - ): - await runner.run(hook, inputs, work_dir) + await runner.run(hook, inputs, work_dir) # Verify the Job was created with the run_id from inputs call_args = batch_api.create_namespaced_job.call_args diff --git a/server/tests/unit/infrastructure/k8s/test_k8s_ingester_runner.py b/server/tests/unit/infrastructure/k8s/test_k8s_ingester_runner.py index 398905e..c6e54b6 100644 --- a/server/tests/unit/infrastructure/k8s/test_k8s_ingester_runner.py +++ b/server/tests/unit/infrastructure/k8s/test_k8s_ingester_runner.py @@ -7,7 +7,7 @@ import pytest from osa.config import K8sConfig -from osa.domain.shared.error import ExternalServiceError +from osa.domain.shared.error import OOMError, TransientError from osa.domain.shared.model.source import IngesterDefinition, IngesterLimits from osa.domain.shared.model.srn import ConventionSRN from osa.domain.shared.port.ingester_runner import IngesterInputs @@ -214,6 +214,8 @@ async def test_successful_run_with_records(self, tmp_path: Path): batch_api = AsyncMock() core_api = AsyncMock() + runner._batch_api = batch_api + runner._core_api = core_api # No existing jobs job_list = MagicMock() @@ -259,14 +261,7 @@ async def s3_get(key: str) -> bytes: runner._s3.get_object.side_effect = s3_get inputs = IngesterInputs(convention_srn=_CONV_SRN) - result = await runner._run_job( - batch_api, - core_api, - ingester, - inputs, - work_dir, - files_dir, - ) + result = 
await runner._run_job(ingester, inputs, work_dir, files_dir) assert len(result.records) == 2 assert result.session == {"cursor": "abc"} @@ -274,12 +269,14 @@ async def s3_get(key: str) -> bytes: batch_api.delete_namespaced_job.assert_called_once() @pytest.mark.asyncio - async def test_timeout_raises_external_service_error(self, tmp_path: Path): + async def test_timeout_raises_transient_error(self, tmp_path: Path): config = _make_config(data_mount_path=str(tmp_path)) runner = K8sIngesterRunner(api_client=MagicMock(), config=config, s3=_make_s3_mock()) batch_api = AsyncMock() core_api = AsyncMock() + runner._batch_api = batch_api + runner._core_api = core_api job_list = MagicMock() job_list.items = [] @@ -311,23 +308,18 @@ async def test_timeout_raises_external_service_error(self, tmp_path: Path): files_dir.mkdir(parents=True) inputs = IngesterInputs(convention_srn=_CONV_SRN) - with pytest.raises(ExternalServiceError, match="[Tt]imed out|[Dd]eadline"): - await runner._run_job( - batch_api, - core_api, - ingester, - inputs, - work_dir, - files_dir, - ) + with pytest.raises(TransientError, match="[Tt]imed out|[Dd]eadline"): + await runner._run_job(ingester, inputs, work_dir, files_dir) @pytest.mark.asyncio - async def test_oom_raises_external_service_error(self, tmp_path: Path): + async def test_oom_raises_oom_error(self, tmp_path: Path): config = _make_config(data_mount_path=str(tmp_path)) runner = K8sIngesterRunner(api_client=MagicMock(), config=config, s3=_make_s3_mock()) batch_api = AsyncMock() core_api = AsyncMock() + runner._batch_api = batch_api + runner._core_api = core_api job_list = MagicMock() job_list.items = [] @@ -371,15 +363,8 @@ async def test_oom_raises_external_service_error(self, tmp_path: Path): files_dir.mkdir(parents=True) inputs = IngesterInputs(convention_srn=_CONV_SRN) - with pytest.raises(ExternalServiceError, match="[Oo]OM"): - await runner._run_job( - batch_api, - core_api, - ingester, - inputs, - work_dir, - files_dir, - ) + with pytest.raises(OOMError, match="[Oo]OM"): + await runner._run_job(ingester, inputs, work_dir, files_dir) # --------------------------------------------------------------------------- @@ -392,13 +377,13 @@ class TestConventionSrnFromInputs: @pytest.mark.asyncio async def test_run_uses_convention_srn_from_inputs(self, tmp_path: Path): - from unittest.mock import patch - config = _make_config(data_mount_path=str(tmp_path)) runner = K8sIngesterRunner(api_client=MagicMock(), config=config, s3=_make_s3_mock()) batch_api = AsyncMock() core_api = AsyncMock() + runner._batch_api = batch_api + runner._core_api = core_api # No existing jobs job_list = MagicMock() @@ -432,11 +417,7 @@ async def test_run_uses_convention_srn_from_inputs(self, tmp_path: Path): convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:my-conv@1.0.0") ) - with ( - patch("kubernetes_asyncio.client.BatchV1Api", return_value=batch_api), - patch("kubernetes_asyncio.client.CoreV1Api", return_value=core_api), - ): - await runner.run(ingester, inputs, files_dir, work_dir) + await runner.run(ingester, inputs, files_dir, work_dir) # Verify convention_srn from inputs ends up in the Job labels call_args = batch_api.create_namespaced_job.call_args diff --git a/server/tests/unit/infrastructure/test_oci_hook_runner.py b/server/tests/unit/infrastructure/test_oci_hook_runner.py index 35352bb..33e20a8 100644 --- a/server/tests/unit/infrastructure/test_oci_hook_runner.py +++ b/server/tests/unit/infrastructure/test_oci_hook_runner.py @@ -5,6 +5,7 @@ import pytest +from 
osa.domain.shared.error import OOMError, PermanentError, TransientError
 from osa.domain.shared.model.hook import (
     ColumnDef,
     HookDefinition,
@@ -233,13 +234,11 @@ async def test_nonzero_exit_returns_failed(self, tmp_path: Path):
         output_dir = tmp_path / "output"
         output_dir.mkdir()
 
-        result = await runner.run(hook, inputs, output_dir)
-
-        assert result.status == HookStatus.FAILED
-        assert "exit" in (result.error_message or "").lower()
+        with pytest.raises(PermanentError, match="[Ee]xit"):
+            await runner.run(hook, inputs, output_dir)
 
     @pytest.mark.asyncio
-    async def test_oom_killed_returns_oom(self, tmp_path: Path):
+    async def test_oom_killed_raises_oom_error(self, tmp_path: Path):
         docker = AsyncMock()
         container = AsyncMock()
         docker.containers.create.return_value = container
@@ -256,13 +255,11 @@ async def test_oom_killed_returns_oom(self, tmp_path: Path):
         output_dir = tmp_path / "output"
         output_dir.mkdir()
 
-        result = await runner.run(hook, inputs, output_dir)
-
-        assert result.status == HookStatus.OOM
-        assert "oom" in (result.error_message or "").lower()
+        with pytest.raises(OOMError, match="[Oo][Oo][Mm]"):
+            await runner.run(hook, inputs, output_dir)
 
     @pytest.mark.asyncio
-    async def test_timeout_returns_failed(self, tmp_path: Path):
+    async def test_timeout_raises_transient_error(self, tmp_path: Path):
         import asyncio
 
         docker = AsyncMock()
@@ -286,10 +283,8 @@ async def hang():
         output_dir = tmp_path / "output"
         output_dir.mkdir()
 
-        result = await runner.run(hook, inputs, output_dir)
-
-        assert result.status == HookStatus.FAILED
-        assert "timed out" in (result.error_message or "").lower()
+        with pytest.raises(TransientError, match="[Tt]imed out"):
+            await runner.run(hook, inputs, output_dir)
 
     @pytest.mark.asyncio
     async def test_rejection_via_progress(self, tmp_path: Path):
@@ -474,7 +469,7 @@ async def test_container_deleted_on_failure(self, tmp_path: Path):
         output_dir = tmp_path / "output"
         output_dir.mkdir()
 
-        result = await runner.run(hook, inputs, output_dir)
-
-        assert result.status == HookStatus.FAILED
-        container.delete.assert_called_once_with(force=True)
+        with pytest.raises(TransientError, match="Docker error"):
+            await runner.run(hook, inputs, output_dir)
+
+        container.delete.assert_called_once_with(force=True)

From 660fe3bd3ed2882edfbacfdc90f5d496d909dcdf Mon Sep 17 00:00:00 2001
From: Rory Byrne
Date: Sun, 5 Apr 2026 20:40:32 +0100
Subject: [PATCH 02/13] refactor: replace ingest run SRN with simple ID
 throughout system

Change the ingest run identifier from SRN format to a simple string ID to
simplify the data model and reduce coupling to SRN conventions.

Remove the foreign key constraint from ingest_runs to the conventions table
so ingest run rows can be managed independently of convention rows.
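Note: a rough sketch of the id-first shape this patch moves to. It is not
part of the diff — generate_run_id and display_srn are hypothetical helpers;
only the urn:osa:{root}:ing:{id} rendering is taken from the
StartIngestHandler change below.

    from typing import NewType
    from uuid import uuid4

    IngestRunId = NewType("IngestRunId", str)

    def generate_run_id() -> IngestRunId:
        # Opaque identifier; no SRN semantics attached to the model.
        return IngestRunId(uuid4().hex)

    def display_srn(node_root: str, run_id: IngestRunId) -> str:
        # The API edge can still render an SRN for responses, as
        # StartIngestHandler now does, without the model storing one.
        return f"urn:osa:{node_root}:ing:{run_id}"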
--- server/migrations/versions/add_ingest_runs.py | 3 +- .../feature/handler/insert_batch_features.py | 8 ++--- .../osa/domain/ingest/command/start_ingest.py | 8 ++++- server/osa/domain/ingest/event/events.py | 15 ++++---- .../domain/ingest/handler/publish_batch.py | 28 +++++++-------- server/osa/domain/ingest/handler/run_hooks.py | 30 ++++++++-------- .../osa/domain/ingest/handler/run_ingester.py | 36 +++++++++---------- server/osa/domain/ingest/model/ingest_run.py | 5 ++- server/osa/domain/ingest/port/repository.py | 12 +++---- server/osa/domain/ingest/port/storage.py | 16 ++++----- server/osa/domain/ingest/service/ingest.py | 23 ++++++------ server/osa/domain/shared/model/source.py | 2 +- server/osa/infrastructure/k8s/runner.py | 11 ++++-- .../persistence/adapter/ingest_storage.py | 32 ++++++++--------- .../persistence/repository/ingest.py | 28 +++++++-------- .../osa/infrastructure/persistence/tables.py | 4 +-- .../osa/infrastructure/s3/ingest_storage.py | 32 ++++++++--------- server/osa/infrastructure/storage/layout.py | 25 ++++++------- .../feature/test_insert_record_features.py | 2 +- .../unit/domain/ingest/test_ingest_run.py | 2 +- .../unit/domain/ingest/test_ingest_service.py | 4 +-- .../unit/domain/record/test_record_service.py | 2 +- .../unit/domain/shared/test_record_source.py | 12 +++---- .../k8s/test_k8s_hook_runner.py | 6 ++-- 24 files changed, 176 insertions(+), 170 deletions(-) diff --git a/server/migrations/versions/add_ingest_runs.py b/server/migrations/versions/add_ingest_runs.py index 6a23b7c..d4dedf5 100644 --- a/server/migrations/versions/add_ingest_runs.py +++ b/server/migrations/versions/add_ingest_runs.py @@ -23,11 +23,10 @@ def upgrade() -> None: op.create_table( "ingest_runs", - sa.Column("srn", sa.String(), primary_key=True), + sa.Column("id", sa.String(), primary_key=True), sa.Column( "convention_srn", sa.String(), - sa.ForeignKey("conventions.srn"), nullable=False, ), sa.Column( diff --git a/server/osa/domain/feature/handler/insert_batch_features.py b/server/osa/domain/feature/handler/insert_batch_features.py index cf7f366..39896d7 100644 --- a/server/osa/domain/feature/handler/insert_batch_features.py +++ b/server/osa/domain/feature/handler/insert_batch_features.py @@ -26,9 +26,7 @@ async def handle(self, event: IngestBatchPublished) -> None: if not event.expected_features or not event.published_srns: return - batch_output_dir = str( - self.layout.ingest_batch_dir(event.ingest_run_srn, event.batch_index) - ) + batch_output_dir = str(self.layout.ingest_batch_dir(event.ingest_run_id, event.batch_index)) total_inserted = 0 skipped_dupes = 0 @@ -59,7 +57,7 @@ async def handle(self, event: IngestBatchPublished) -> None: ) total_inserted += count - short_id = event.ingest_run_srn.rsplit(":", 1)[-1][:8] + short_id = event.ingest_run_id[:8] dupe_msg = f", {skipped_dupes} duplicates skipped" if skipped_dupes else "" log.info( "[{short_id}] batch {batch_index}: inserted {total_inserted} feature rows ({hook_count} hooks{dupe_msg})", @@ -68,5 +66,5 @@ async def handle(self, event: IngestBatchPublished) -> None: total_inserted=total_inserted, hook_count=len(event.expected_features), dupe_msg=dupe_msg, - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, ) diff --git a/server/osa/domain/ingest/command/start_ingest.py b/server/osa/domain/ingest/command/start_ingest.py index 1b65bbd..ddc9716 100644 --- a/server/osa/domain/ingest/command/start_ingest.py +++ b/server/osa/domain/ingest/command/start_ingest.py @@ -34,13 +34,19 @@ class 
StartIngestHandler(CommandHandler[StartIngest, IngestRunCreated]): service: IngestService async def run(self, cmd: StartIngest) -> IngestRunCreated: + from osa.domain.shared.model.srn import Domain + ingest_run = await self.service.start_ingest( convention_srn=cmd.convention_srn, batch_size=cmd.batch_size, limit=cmd.limit, ) + + node_domain: Domain = self.service.node_domain + srn = f"urn:osa:{node_domain.root}:ing:{ingest_run.id}" + return IngestRunCreated( - srn=ingest_run.srn, + srn=srn, convention_srn=ingest_run.convention_srn, status=ingest_run.status, started_at=ingest_run.started_at.isoformat(), diff --git a/server/osa/domain/ingest/event/events.py b/server/osa/domain/ingest/event/events.py index 7b14e51..49e74f5 100644 --- a/server/osa/domain/ingest/event/events.py +++ b/server/osa/domain/ingest/event/events.py @@ -1,5 +1,6 @@ """Ingest domain events — payloads carry path references, not inline data (AD-1).""" +from osa.domain.ingest.model.ingest_run import IngestRunId from osa.domain.shared.event import Event, EventId @@ -7,7 +8,7 @@ class IngestRunStarted(Event): """Emitted once when an ingest run is created. Observability/audit only.""" id: EventId - ingest_run_srn: str + ingest_run_id: IngestRunId convention_srn: str batch_size: int @@ -20,7 +21,7 @@ class NextBatchRequested(Event): """ id: EventId - ingest_run_srn: str + ingest_run_id: IngestRunId convention_srn: str batch_size: int @@ -28,11 +29,11 @@ class NextBatchRequested(Event): class IngesterBatchReady(Event): """Emitted when an ingester container produces a batch of records. - Batch data is on disk at the path derived from {ingest_run_srn, batch_index}. + Batch data is on disk at the path derived from {ingest_run_id, batch_index}. """ id: EventId - ingest_run_srn: str + ingest_run_id: IngestRunId batch_index: int has_more: bool @@ -44,7 +45,7 @@ class HookBatchCompleted(Event): """ id: EventId - ingest_run_srn: str + ingest_run_id: IngestRunId batch_index: int @@ -56,7 +57,7 @@ class IngestBatchPublished(Event): """ id: EventId - ingest_run_srn: str + ingest_run_id: IngestRunId convention_srn: str batch_index: int published_srns: list[str] @@ -69,5 +70,5 @@ class IngestCompleted(Event): """Emitted when all batches are processed and the ingest run is complete.""" id: EventId - ingest_run_srn: str + ingest_run_id: IngestRunId total_published: int diff --git a/server/osa/domain/ingest/handler/publish_batch.py b/server/osa/domain/ingest/handler/publish_batch.py index 40b94d7..f5b7086 100644 --- a/server/osa/domain/ingest/handler/publish_batch.py +++ b/server/osa/domain/ingest/handler/publish_batch.py @@ -37,22 +37,20 @@ class PublishBatch(EventHandler[HookBatchCompleted]): ingest_storage: IngestStoragePort async def handle(self, event: HookBatchCompleted) -> None: - ingest_run = await self.ingest_repo.get(event.ingest_run_srn) + ingest_run = await self.ingest_repo.get(event.ingest_run_id) if ingest_run is None: - raise NotFoundError(f"Ingest run not found: {event.ingest_run_srn}") + raise NotFoundError(f"Ingest run not found: {event.ingest_run_id}") convention = await self.convention_service.get_convention( ConventionSRN.parse(ingest_run.convention_srn) ) # Read ingester records via storage port (filesystem or S3) - raw_records = await self.ingest_storage.read_records( - event.ingest_run_srn, event.batch_index - ) + raw_records = await self.ingest_storage.read_records(event.ingest_run_id, event.batch_index) ingester_records = IngesterRecord.from_dicts(raw_records) # batch_dir used as locator for hook outcome reads - 
batch_dir = str(self.ingest_storage.batch_dir(event.ingest_run_srn, event.batch_index)) + batch_dir = str(self.ingest_storage.batch_dir(event.ingest_run_id, event.batch_index)) # Read hook outcomes for all hooks expected_features = [h.name for h in convention.hooks] @@ -67,7 +65,7 @@ async def handle(self, event: HookBatchCompleted) -> None: ) # Log outcome breakdown per hook - short_id = event.ingest_run_srn.rsplit(":", 1)[-1][:8] + short_id = event.ingest_run_id[:8] total = len(ingester_records) for hook_name in expected_features: outcomes = await self.feature_storage.read_batch_outcomes(str(batch_dir), hook_name) @@ -88,7 +86,7 @@ async def handle(self, event: HookBatchCompleted) -> None: rejected=rejected, errored=errored, missing=missing, - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, ) published_count = 0 @@ -100,7 +98,7 @@ async def handle(self, event: HookBatchCompleted) -> None: RecordDraft( source=IngestSource( id=f"{ingest_run.convention_srn}:{record.source_id}", - ingest_run_srn=ingest_run.srn, + ingest_run_id=ingest_run.id, upstream_source=record.source_id, ), metadata=record.metadata, @@ -133,7 +131,7 @@ async def handle(self, event: HookBatchCompleted) -> None: published=published_count, passed=len(passed_records), duplicates=len(drafts) - published_count, - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, ) # Emit IngestBatchPublished for feature insertion @@ -141,7 +139,7 @@ async def handle(self, event: HookBatchCompleted) -> None: await self.outbox.append( IngestBatchPublished( id=EventId(uuid4()), - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, convention_srn=ingest_run.convention_srn, batch_index=event.batch_index, published_srns=published_srns, @@ -153,7 +151,7 @@ async def handle(self, event: HookBatchCompleted) -> None: # Update counters atomically updated = await self.ingest_repo.increment_completed( - event.ingest_run_srn, + event.ingest_run_id, published_count=published_count, ) @@ -165,16 +163,16 @@ async def handle(self, event: HookBatchCompleted) -> None: await self.outbox.append( IngestCompleted( id=EventId(uuid4()), - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, total_published=updated.published_count, ) ) - short_id = event.ingest_run_srn.rsplit(":", 1)[-1][:8] + short_id = event.ingest_run_id[:8] log.info( "[{short_id}] COMPLETE: {total_published} records published", short_id=short_id, total_published=updated.published_count, - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, ) diff --git a/server/osa/domain/ingest/handler/run_hooks.py b/server/osa/domain/ingest/handler/run_hooks.py index 18f82de..5023a35 100644 --- a/server/osa/domain/ingest/handler/run_hooks.py +++ b/server/osa/domain/ingest/handler/run_hooks.py @@ -34,29 +34,27 @@ class RunHooks(EventHandler[IngesterBatchReady]): ingest_storage: IngestStoragePort async def handle(self, event: IngesterBatchReady) -> None: - ingest_run = await self.ingest_repo.get(event.ingest_run_srn) + ingest_run = await self.ingest_repo.get(event.ingest_run_id) if ingest_run is None: - raise NotFoundError(f"Ingest run not found: {event.ingest_run_srn}") + raise NotFoundError(f"Ingest run not found: {event.ingest_run_id}") convention = await self.convention_service.get_convention( ConventionSRN.parse(ingest_run.convention_srn) ) # Read records via storage port (filesystem or S3) - raw_records = await self.ingest_storage.read_records( - event.ingest_run_srn, event.batch_index - ) + 
raw_records = await self.ingest_storage.read_records(event.ingest_run_id, event.batch_index) records = IngesterRecord.from_dicts(raw_records) if not records: log.warn( "ingest batch {batch_index}: no records to process", batch_index=event.batch_index, - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, ) # Build files_dirs from ingester files (Path locators for runner volume mounts) - files_base = self.ingest_storage.batch_files_dir(event.ingest_run_srn, event.batch_index) + files_base = self.ingest_storage.batch_files_dir(event.ingest_run_id, event.batch_index) files_dirs: dict[str, Path] = {} for record in records: if record.files: @@ -72,7 +70,7 @@ async def handle(self, event: IngesterBatchReady) -> None: ) for r in records ], - run_id=f"{event.ingest_run_srn}_batch{event.batch_index}", + run_id=f"{event.ingest_run_id}_b{event.batch_index}", files_dirs=files_dirs, ) @@ -80,7 +78,7 @@ async def handle(self, event: IngesterBatchReady) -> None: work_dirs: dict[str, Path] = {} for hook in convention.hooks: work_dirs[hook.name] = self.ingest_storage.hook_work_dir( - event.ingest_run_srn, event.batch_index, hook.name + event.ingest_run_id, event.batch_index, hook.name ) # Run all hooks via HookService @@ -93,15 +91,15 @@ async def handle(self, event: IngesterBatchReady) -> None: except PermanentError as e: log.error( "[{short_id}] batch {batch_index} permanently failed: {error}", - short_id=event.ingest_run_srn.rsplit(":", 1)[-1][:8], + short_id=event.ingest_run_id[:8], batch_index=event.batch_index, error=str(e), - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, ) await self._fail_batch(event) return - short_id = event.ingest_run_srn.rsplit(":", 1)[-1][:8] + short_id = event.ingest_run_id[:8] for result in results: log.info( "[{short_id}] batch {batch_index} hook={hook_name}: {status} in {duration:.1f}s", @@ -110,14 +108,14 @@ async def handle(self, event: IngesterBatchReady) -> None: hook_name=result.hook_name, status=result.status.value, duration=result.duration_seconds, - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, ) # Emit HookBatchCompleted await self.outbox.append( HookBatchCompleted( id=EventId(uuid4()), - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, batch_index=event.batch_index, ) ) @@ -127,10 +125,10 @@ async def on_exhausted(self, event: IngesterBatchReady) -> None: log.error( "batch {batch_index} retries exhausted", batch_index=event.batch_index, - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, ) await self._fail_batch(event) async def _fail_batch(self, event: IngesterBatchReady) -> None: """Account for a permanently failed batch.""" - await self.ingest_service.fail_batch(event.ingest_run_srn) + await self.ingest_service.fail_batch(event.ingest_run_id) diff --git a/server/osa/domain/ingest/handler/run_ingester.py b/server/osa/domain/ingest/handler/run_ingester.py index a7048b7..a9d397d 100644 --- a/server/osa/domain/ingest/handler/run_ingester.py +++ b/server/osa/domain/ingest/handler/run_ingester.py @@ -31,9 +31,9 @@ class RunIngester(EventHandler[NextBatchRequested]): ingest_storage: IngestStoragePort async def handle(self, event: NextBatchRequested) -> None: - ingest_run = await self.ingest_repo.get(event.ingest_run_srn) + ingest_run = await self.ingest_repo.get(event.ingest_run_id) if ingest_run is None: - raise NotFoundError(f"Ingest run not found: {event.ingest_run_srn}") + raise NotFoundError(f"Ingest run not found: 
{event.ingest_run_id}") if ingest_run.status == IngestStatus.PENDING: ingest_run.mark_running() @@ -47,7 +47,7 @@ async def handle(self, event: NextBatchRequested) -> None: batch_index = ingest_run.batches_ingested - session = await self.ingest_storage.read_session(event.ingest_run_srn) + session = await self.ingest_storage.read_session(event.ingest_run_id) effective_batch_limit = ingest_run.batch_size if ingest_run.limit is not None: @@ -58,10 +58,10 @@ async def handle(self, event: NextBatchRequested) -> None: "Ignoring redelivered NextBatchRequested — limit already met (batches_ingested={batches_ingested}, limit={limit})", batches_ingested=ingest_run.batches_ingested, limit=ingest_run.limit, - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, ) await self.ingest_repo.increment_batches_ingested( - event.ingest_run_srn, + event.ingest_run_id, set_ingestion_finished=True, ) return @@ -73,8 +73,8 @@ async def handle(self, event: NextBatchRequested) -> None: limit=effective_batch_limit, session=session, ) - work_dir = self.ingest_storage.batch_work_dir(event.ingest_run_srn, batch_index) - files_dir = self.ingest_storage.batch_files_dir(event.ingest_run_srn, batch_index) + work_dir = self.ingest_storage.batch_work_dir(event.ingest_run_id, batch_index) + files_dir = self.ingest_storage.batch_files_dir(event.ingest_run_id, batch_index) try: output = await self.ingester_runner.run( @@ -86,17 +86,17 @@ async def handle(self, event: NextBatchRequested) -> None: except PermanentError as e: log.error( "[{short_id}] ingester permanently failed: {error}", - short_id=event.ingest_run_srn.rsplit(":", 1)[-1][:8], + short_id=event.ingest_run_id[:8], error=str(e), - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, ) await self._fail_ingestion(event) return - await self.ingest_storage.write_records(event.ingest_run_srn, batch_index, output.records) + await self.ingest_storage.write_records(event.ingest_run_id, batch_index, output.records) if output.session: - await self.ingest_storage.write_session(event.ingest_run_srn, output.session) + await self.ingest_storage.write_session(event.ingest_run_id, output.session) has_more = output.session is not None and len(output.records) > 0 @@ -106,34 +106,34 @@ async def handle(self, event: NextBatchRequested) -> None: has_more = False await self.ingest_repo.increment_batches_ingested( - event.ingest_run_srn, + event.ingest_run_id, set_ingestion_finished=not has_more, ) await self.outbox.append( IngesterBatchReady( id=EventId(uuid4()), - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, batch_index=batch_index, has_more=has_more, ) ) - short_id = event.ingest_run_srn.rsplit(":", 1)[-1][:8] + short_id = event.ingest_run_id[:8] log.info( "[{short_id}] batch {batch_index}: pulled {record_count} records (has_more={has_more})", short_id=short_id, batch_index=batch_index, record_count=len(output.records), has_more=has_more, - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, ) if has_more: await self.outbox.append( NextBatchRequested( id=EventId(uuid4()), - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, convention_srn=event.convention_srn, batch_size=ingest_run.batch_size, ) @@ -143,10 +143,10 @@ async def on_exhausted(self, event: NextBatchRequested) -> None: """Transient retries exhausted — stop ingestion and check completion.""" log.error( "ingester retries exhausted", - ingest_run_srn=event.ingest_run_srn, + ingest_run_id=event.ingest_run_id, ) 
await self._fail_ingestion(event) async def _fail_ingestion(self, event: NextBatchRequested) -> None: """Account for a permanently failed ingester pull.""" - await self.ingest_service.fail_ingestion(event.ingest_run_srn) + await self.ingest_service.fail_ingestion(event.ingest_run_id) diff --git a/server/osa/domain/ingest/model/ingest_run.py b/server/osa/domain/ingest/model/ingest_run.py index cf72f77..53a899b 100644 --- a/server/osa/domain/ingest/model/ingest_run.py +++ b/server/osa/domain/ingest/model/ingest_run.py @@ -2,10 +2,13 @@ from datetime import datetime from enum import StrEnum +from typing import NewType from osa.domain.shared.error import InvalidStateError from osa.domain.shared.model.aggregate import Aggregate +IngestRunId = NewType("IngestRunId", str) + class IngestStatus(StrEnum): PENDING = "pending" @@ -29,7 +32,7 @@ class IngestRun(Aggregate): Counter updates use atomic SQL increments in the repository. """ - srn: str + id: IngestRunId convention_srn: str status: IngestStatus = IngestStatus.PENDING ingestion_finished: bool = False diff --git a/server/osa/domain/ingest/port/repository.py b/server/osa/domain/ingest/port/repository.py index de27dc7..3b6a376 100644 --- a/server/osa/domain/ingest/port/repository.py +++ b/server/osa/domain/ingest/port/repository.py @@ -3,7 +3,7 @@ from abc import abstractmethod from typing import Protocol -from osa.domain.ingest.model.ingest_run import IngestRun +from osa.domain.ingest.model.ingest_run import IngestRun, IngestRunId from osa.domain.shared.port import Port @@ -21,8 +21,8 @@ async def save(self, ingest_run: IngestRun) -> None: ... @abstractmethod - async def get(self, srn: str) -> IngestRun | None: - """Get an ingest run by SRN.""" + async def get(self, id: IngestRunId) -> IngestRun | None: + """Get an ingest run by ID.""" ... @abstractmethod @@ -32,7 +32,7 @@ async def get_running_for_convention(self, convention_srn: str) -> IngestRun | N @abstractmethod async def increment_batches_ingested( - self, srn: str, *, set_ingestion_finished: bool = False + self, id: IngestRunId, *, set_ingestion_finished: bool = False ) -> IngestRun: """Atomically increment batches_ingested and optionally set ingestion_finished. @@ -41,7 +41,7 @@ async def increment_batches_ingested( ... @abstractmethod - async def increment_failed(self, srn: str) -> IngestRun: + async def increment_failed(self, id: IngestRunId) -> IngestRun: """Atomically increment batches_failed. Returns the updated IngestRun for completion condition checking. @@ -49,7 +49,7 @@ async def increment_failed(self, srn: str) -> IngestRun: ... @abstractmethod - async def increment_completed(self, srn: str, published_count: int) -> IngestRun: + async def increment_completed(self, id: IngestRunId, published_count: int) -> IngestRun: """Atomically increment batches_completed and published_count. Returns the updated IngestRun with DB-authoritative counter values diff --git a/server/osa/domain/ingest/port/storage.py b/server/osa/domain/ingest/port/storage.py index e66a5d0..3099c46 100644 --- a/server/osa/domain/ingest/port/storage.py +++ b/server/osa/domain/ingest/port/storage.py @@ -20,43 +20,43 @@ class IngestStoragePort(Port, Protocol): """ @abstractmethod - async def read_session(self, ingest_run_srn: str) -> dict[str, Any] | None: + async def read_session(self, ingest_run_id: str) -> dict[str, Any] | None: """Read session state for ingester continuation. Returns None if no session.""" ... 
@abstractmethod - async def write_session(self, ingest_run_srn: str, session: dict[str, Any]) -> None: + async def write_session(self, ingest_run_id: str, session: dict[str, Any]) -> None: """Persist session state between batches.""" ... @abstractmethod async def write_records( - self, ingest_run_srn: str, batch_index: int, records: list[dict[str, Any]] + self, ingest_run_id: str, batch_index: int, records: list[dict[str, Any]] ) -> None: """Write ingester output records for a batch as JSONL.""" ... @abstractmethod - async def read_records(self, ingest_run_srn: str, batch_index: int) -> list[dict[str, Any]]: + async def read_records(self, ingest_run_id: str, batch_index: int) -> list[dict[str, Any]]: """Read raw ingester output records for a batch.""" ... @abstractmethod - def batch_dir(self, ingest_run_srn: str, batch_index: int) -> Path: + def batch_dir(self, ingest_run_id: str, batch_index: int) -> Path: """Return the batch-level directory (parent of ingester/ and hooks/).""" ... @abstractmethod - def batch_work_dir(self, ingest_run_srn: str, batch_index: int) -> Path: + def batch_work_dir(self, ingest_run_id: str, batch_index: int) -> Path: """Return the ingester work directory for a batch.""" ... @abstractmethod - def batch_files_dir(self, ingest_run_srn: str, batch_index: int) -> Path: + def batch_files_dir(self, ingest_run_id: str, batch_index: int) -> Path: """Return the files directory for a batch.""" ... @abstractmethod - def hook_work_dir(self, ingest_run_srn: str, batch_index: int, hook_name: str) -> Path: + def hook_work_dir(self, ingest_run_id: str, batch_index: int, hook_name: str) -> Path: """Return the hook output directory for a batch.""" ... diff --git a/server/osa/domain/ingest/service/ingest.py b/server/osa/domain/ingest/service/ingest.py index eb8ec1b..f009d77 100644 --- a/server/osa/domain/ingest/service/ingest.py +++ b/server/osa/domain/ingest/service/ingest.py @@ -5,7 +5,7 @@ from osa.domain.deposition.service.convention import ConventionService from osa.domain.ingest.event.events import IngestCompleted, IngestRunStarted, NextBatchRequested -from osa.domain.ingest.model.ingest_run import IngestRun, IngestStatus +from osa.domain.ingest.model.ingest_run import IngestRun, IngestRunId, IngestStatus from osa.domain.ingest.port.repository import IngestRunRepository from osa.domain.shared.error import ConflictError, NotFoundError from osa.domain.shared.event import EventId @@ -54,11 +54,11 @@ async def start_ingest( code="ingest_already_running", ) - srn = f"urn:osa:{self.node_domain.root}:ing:{uuid4()}" + run_id = IngestRunId(str(uuid4())) now = datetime.now(UTC) ingest_run = IngestRun( - srn=srn, + id=run_id, convention_srn=convention_srn, status=IngestStatus.PENDING, batch_size=batch_size, @@ -71,7 +71,7 @@ async def start_ingest( await self.outbox.append( IngestRunStarted( id=EventId(uuid4()), - ingest_run_srn=srn, + ingest_run_id=run_id, convention_srn=convention_srn, batch_size=batch_size, ) @@ -80,12 +80,13 @@ async def start_ingest( await self.outbox.append( NextBatchRequested( id=EventId(uuid4()), - ingest_run_srn=srn, + ingest_run_id=run_id, convention_srn=convention_srn, batch_size=batch_size, ) ) + srn = f"urn:osa:{self.node_domain.root}:ing:{run_id}" log.info( "ingest started for {convention_srn}", ingest_run_srn=srn, @@ -95,16 +96,16 @@ async def start_ingest( ) return ingest_run - async def fail_batch(self, ingest_run_srn: str) -> None: + async def fail_batch(self, ingest_run_id: IngestRunId) -> None: """Account for a batch that permanently failed hook 
processing. Increments batches_failed and completes the run if all batches are now accounted for (completed + failed >= ingested). """ - ingest_run = await self.ingest_repo.increment_failed(ingest_run_srn) + ingest_run = await self.ingest_repo.increment_failed(ingest_run_id) await self._check_completion(ingest_run) - async def fail_ingestion(self, ingest_run_srn: str) -> None: + async def fail_ingestion(self, ingest_run_id: IngestRunId) -> None: """Account for a failed ingester pull. The batch was never sourced, so we mark ingestion as finished @@ -113,10 +114,10 @@ async def fail_ingestion(self, ingest_run_srn: str) -> None: were already sourced. """ await self.ingest_repo.increment_batches_ingested( - ingest_run_srn, + ingest_run_id, set_ingestion_finished=True, ) - ingest_run = await self.ingest_repo.increment_failed(ingest_run_srn) + ingest_run = await self.ingest_repo.increment_failed(ingest_run_id) await self._check_completion(ingest_run) async def _check_completion(self, ingest_run: IngestRun) -> None: @@ -129,7 +130,7 @@ async def _check_completion(self, ingest_run: IngestRun) -> None: await self.outbox.append( IngestCompleted( id=EventId(uuid4()), - ingest_run_srn=ingest_run.srn, + ingest_run_id=ingest_run.id, total_published=ingest_run.published_count, ) ) diff --git a/server/osa/domain/shared/model/source.py b/server/osa/domain/shared/model/source.py index 9d4b1ef..47de498 100644 --- a/server/osa/domain/shared/model/source.py +++ b/server/osa/domain/shared/model/source.py @@ -55,7 +55,7 @@ class IngestSource(_RecordSourceBase): """Record originated from an automated ingest run.""" type: Literal["ingest"] = "ingest" - ingest_run_srn: str + ingest_run_id: str upstream_source: str diff --git a/server/osa/infrastructure/k8s/runner.py b/server/osa/infrastructure/k8s/runner.py index 46ff230..4568ca8 100644 --- a/server/osa/infrastructure/k8s/runner.py +++ b/server/osa/infrastructure/k8s/runner.py @@ -20,7 +20,7 @@ from osa.domain.validation.model.hook_result import HookResult, HookStatus from osa.domain.validation.port.hook_runner import HookInputs, HookRunner from osa.infrastructure.k8s.errors import classify_api_error -from osa.infrastructure.k8s.naming import job_name, label_value +from osa.infrastructure.k8s.naming import job_name from osa.infrastructure.runner_utils import ( detect_rejection, relative_path, @@ -183,7 +183,9 @@ async def _check_existing_job( "active:{job_name}" if a running Job exists None if no Job or only failed Jobs exist """ - label_selector = f"osa.io/hook={hook_name},osa.io/run-id={label_value(run_id)}" + ingest_run_id = run_id.split("_b", 1)[0] + batch_index = run_id.split("_b", 1)[1] if "_b" in run_id else "0" + label_selector = f"osa.io/hook={hook_name},osa.io/ingest-run-id={ingest_run_id},osa.io/ingest-run-batch={batch_index}" try: job_list = await self._batch_api.list_namespaced_job( namespace, label_selector=label_selector @@ -234,10 +236,13 @@ def _build_job_spec( input_subpath = f"{relative_work}/input" output_subpath = f"{relative_work}/output" + ingest_run_id = run_id.split("_b", 1)[0] + batch_index = run_id.split("_b", 1)[1] if "_b" in run_id else "0" labels = { "osa.io/role": "hook", "osa.io/hook": hook.name, - "osa.io/run-id": label_value(run_id), + "osa.io/ingest-run-id": ingest_run_id, + "osa.io/ingest-run-batch": batch_index, } mounts = [ diff --git a/server/osa/infrastructure/persistence/adapter/ingest_storage.py b/server/osa/infrastructure/persistence/adapter/ingest_storage.py index 57db091..44bf535 100644 --- 
a/server/osa/infrastructure/persistence/adapter/ingest_storage.py +++ b/server/osa/infrastructure/persistence/adapter/ingest_storage.py @@ -18,14 +18,14 @@ class FilesystemIngestStorage: def __init__(self, layout: StorageLayout) -> None: self._layout = layout - async def read_session(self, ingest_run_srn: str) -> dict[str, Any] | None: - session_file = self._layout.ingest_session_file(ingest_run_srn) + async def read_session(self, ingest_run_id: str) -> dict[str, Any] | None: + session_file = self._layout.ingest_session_file(ingest_run_id) if not session_file.exists(): return None return json.loads(session_file.read_text()) - async def write_session(self, ingest_run_srn: str, session: dict[str, Any]) -> None: - session_file = self._layout.ingest_session_file(ingest_run_srn) + async def write_session(self, ingest_run_id: str, session: dict[str, Any]) -> None: + session_file = self._layout.ingest_session_file(ingest_run_id) session_file.parent.mkdir(parents=True, exist_ok=True) # Atomic write via temp file + os.replace to handle mountpoint-for-s3 tmp = session_file.with_suffix(".tmp") @@ -33,9 +33,9 @@ async def write_session(self, ingest_run_srn: str, session: dict[str, Any]) -> N os.replace(tmp, session_file) async def write_records( - self, ingest_run_srn: str, batch_index: int, records: list[dict[str, Any]] + self, ingest_run_id: str, batch_index: int, records: list[dict[str, Any]] ) -> None: - ingester_dir = self._layout.ingest_batch_ingester_dir(ingest_run_srn, batch_index) + ingester_dir = self._layout.ingest_batch_ingester_dir(ingest_run_id, batch_index) ingester_dir.mkdir(parents=True, exist_ok=True) records_file = ingester_dir / "records.jsonl" tmp = records_file.with_suffix(".tmp") @@ -44,8 +44,8 @@ async def write_records( f.write(json.dumps(record) + "\n") os.replace(tmp, records_file) - async def read_records(self, ingest_run_srn: str, batch_index: int) -> list[dict[str, Any]]: - ingester_dir = self._layout.ingest_batch_ingester_dir(ingest_run_srn, batch_index) + async def read_records(self, ingest_run_id: str, batch_index: int) -> list[dict[str, Any]]: + ingester_dir = self._layout.ingest_batch_ingester_dir(ingest_run_id, batch_index) records_file = ingester_dir / "records.jsonl" if not records_file.exists(): return [] @@ -57,22 +57,22 @@ async def read_records(self, ingest_run_srn: str, batch_index: int) -> list[dict records.append(json.loads(line)) return records - def batch_dir(self, ingest_run_srn: str, batch_index: int) -> Path: - d = self._layout.ingest_batch_dir(ingest_run_srn, batch_index) + def batch_dir(self, ingest_run_id: str, batch_index: int) -> Path: + d = self._layout.ingest_batch_dir(ingest_run_id, batch_index) d.mkdir(parents=True, exist_ok=True) return d - def batch_work_dir(self, ingest_run_srn: str, batch_index: int) -> Path: - d = self._layout.ingest_batch_ingester_dir(ingest_run_srn, batch_index) + def batch_work_dir(self, ingest_run_id: str, batch_index: int) -> Path: + d = self._layout.ingest_batch_ingester_dir(ingest_run_id, batch_index) d.mkdir(parents=True, exist_ok=True) return d - def batch_files_dir(self, ingest_run_srn: str, batch_index: int) -> Path: - d = self._layout.ingest_batch_ingester_dir(ingest_run_srn, batch_index) / "files" + def batch_files_dir(self, ingest_run_id: str, batch_index: int) -> Path: + d = self._layout.ingest_batch_ingester_dir(ingest_run_id, batch_index) / "files" d.mkdir(parents=True, exist_ok=True) return d - def hook_work_dir(self, ingest_run_srn: str, batch_index: int, hook_name: str) -> Path: - d = 
self._layout.ingest_batch_hook_dir(ingest_run_srn, batch_index, hook_name) + def hook_work_dir(self, ingest_run_id: str, batch_index: int, hook_name: str) -> Path: + d = self._layout.ingest_batch_hook_dir(ingest_run_id, batch_index, hook_name) d.mkdir(parents=True, exist_ok=True) return d diff --git a/server/osa/infrastructure/persistence/repository/ingest.py b/server/osa/infrastructure/persistence/repository/ingest.py index 3c813e6..5d330ac 100644 --- a/server/osa/infrastructure/persistence/repository/ingest.py +++ b/server/osa/infrastructure/persistence/repository/ingest.py @@ -22,7 +22,7 @@ def __init__(self, session: AsyncSession) -> None: async def save(self, ingest_run: IngestRun) -> None: """Insert or update an ingest run.""" values = { - "srn": ingest_run.srn, + "id": ingest_run.id, "convention_srn": ingest_run.convention_srn, "status": ingest_run.status.value, "ingestion_finished": ingest_run.ingestion_finished, @@ -39,15 +39,15 @@ async def save(self, ingest_run: IngestRun) -> None: insert(ingest_runs_table) .values(**values) .on_conflict_do_update( - index_elements=["srn"], + index_elements=["id"], set_=values, ) ) await self._session.execute(stmt) await self._session.flush() - async def get(self, srn: str) -> IngestRun | None: - stmt = select(ingest_runs_table).where(ingest_runs_table.c.srn == srn) + async def get(self, id: str) -> IngestRun | None: + stmt = select(ingest_runs_table).where(ingest_runs_table.c.id == id) result = await self._session.execute(stmt) row = result.mappings().first() if row is None: @@ -72,7 +72,7 @@ async def get_running_for_convention(self, convention_srn: str) -> IngestRun | N return _row_to_ingest_run(dict(row)) async def increment_batches_ingested( - self, srn: str, *, set_ingestion_finished: bool = False + self, id: str, *, set_ingestion_finished: bool = False ) -> IngestRun: """Atomically increment batches_ingested.""" t = ingest_runs_table @@ -82,22 +82,22 @@ async def increment_batches_ingested( if set_ingestion_finished: values["ingestion_finished"] = True - stmt = update(t).where(t.c.srn == srn).values(**values).returning(*t.c) + stmt = update(t).where(t.c.id == id).values(**values).returning(*t.c) result = await self._session.execute(stmt) await self._session.flush() row = result.mappings().first() if row is None: from osa.domain.shared.error import NotFoundError - raise NotFoundError(f"Ingest run not found: {srn}") + raise NotFoundError(f"Ingest run not found: {id}") return _row_to_ingest_run(dict(row)) - async def increment_failed(self, srn: str) -> IngestRun: + async def increment_failed(self, id: str) -> IngestRun: """Atomically increment batches_failed.""" t = ingest_runs_table stmt = ( update(t) - .where(t.c.srn == srn) + .where(t.c.id == id) .values(batches_failed=t.c.batches_failed + 1) .returning(*t.c) ) @@ -107,15 +107,15 @@ async def increment_failed(self, srn: str) -> IngestRun: if row is None: from osa.domain.shared.error import NotFoundError - raise NotFoundError(f"Ingest run not found: {srn}") + raise NotFoundError(f"Ingest run not found: {id}") return _row_to_ingest_run(dict(row)) - async def increment_completed(self, srn: str, published_count: int) -> IngestRun: + async def increment_completed(self, id: str, published_count: int) -> IngestRun: """Atomically increment batches_completed and published_count.""" t = ingest_runs_table stmt = ( update(t) - .where(t.c.srn == srn) + .where(t.c.id == id) .values( batches_completed=t.c.batches_completed + 1, published_count=t.c.published_count + published_count, @@ -128,13 +128,13 @@ 
async def increment_completed(self, srn: str, published_count: int) -> IngestRun if row is None: from osa.domain.shared.error import NotFoundError - raise NotFoundError(f"Ingest run not found: {srn}") + raise NotFoundError(f"Ingest run not found: {id}") return _row_to_ingest_run(dict(row)) def _row_to_ingest_run(row: dict) -> IngestRun: return IngestRun( - srn=row["srn"], + id=row["id"], convention_srn=row["convention_srn"], status=IngestStatus(row["status"]), ingestion_finished=row["ingestion_finished"], diff --git a/server/osa/infrastructure/persistence/tables.py b/server/osa/infrastructure/persistence/tables.py index 7e80d63..315f97c 100644 --- a/server/osa/infrastructure/persistence/tables.py +++ b/server/osa/infrastructure/persistence/tables.py @@ -318,8 +318,8 @@ ingest_runs_table = Table( "ingest_runs", metadata, - Column("srn", String, primary_key=True), - Column("convention_srn", String, ForeignKey("conventions.srn"), nullable=False), + Column("id", String, primary_key=True), + Column("convention_srn", String, nullable=False), Column("status", String(32), nullable=False, server_default=text("'pending'")), Column("ingestion_finished", Boolean, nullable=False, server_default=text("false")), Column("batches_ingested", Integer, nullable=False, server_default=text("0")), diff --git a/server/osa/infrastructure/s3/ingest_storage.py b/server/osa/infrastructure/s3/ingest_storage.py index b46645c..12820a4 100644 --- a/server/osa/infrastructure/s3/ingest_storage.py +++ b/server/osa/infrastructure/s3/ingest_storage.py @@ -37,8 +37,8 @@ def _key(self, path: Path) -> str: """Convert a StorageLayout path to an S3 key.""" return relative_path(path, self._data_mount_path) - async def read_session(self, ingest_run_srn: str) -> dict[str, Any] | None: - key = self._key(self._layout.ingest_session_file(ingest_run_srn)) + async def read_session(self, ingest_run_id: str) -> dict[str, Any] | None: + key = self._key(self._layout.ingest_session_file(ingest_run_id)) try: data = await self._s3.get_object(key) return json.loads(data) @@ -47,20 +47,20 @@ async def read_session(self, ingest_run_srn: str) -> dict[str, Any] | None: return None raise - async def write_session(self, ingest_run_srn: str, session: dict[str, Any]) -> None: - key = self._key(self._layout.ingest_session_file(ingest_run_srn)) + async def write_session(self, ingest_run_id: str, session: dict[str, Any]) -> None: + key = self._key(self._layout.ingest_session_file(ingest_run_id)) await self._s3.put_object(key, json.dumps(session)) async def write_records( - self, ingest_run_srn: str, batch_index: int, records: list[dict[str, Any]] + self, ingest_run_id: str, batch_index: int, records: list[dict[str, Any]] ) -> None: - ingester_dir = self._layout.ingest_batch_ingester_dir(ingest_run_srn, batch_index) + ingester_dir = self._layout.ingest_batch_ingester_dir(ingest_run_id, batch_index) key = f"{self._key(ingester_dir)}/records.jsonl" content = "".join(json.dumps(r) + "\n" for r in records) await self._s3.put_object(key, content) - async def read_records(self, ingest_run_srn: str, batch_index: int) -> list[dict[str, Any]]: - ingester_dir = self._layout.ingest_batch_ingester_dir(ingest_run_srn, batch_index) + async def read_records(self, ingest_run_id: str, batch_index: int) -> list[dict[str, Any]]: + ingester_dir = self._layout.ingest_batch_ingester_dir(ingest_run_id, batch_index) key = f"{self._key(ingester_dir)}/records.jsonl" try: data = await self._s3.get_object(key) @@ -76,14 +76,14 @@ async def read_records(self, ingest_run_srn: str, 
batch_index: int) -> list[dict records.append(json.loads(line)) return records - def batch_dir(self, ingest_run_srn: str, batch_index: int) -> Path: - return self._layout.ingest_batch_dir(ingest_run_srn, batch_index) + def batch_dir(self, ingest_run_id: str, batch_index: int) -> Path: + return self._layout.ingest_batch_dir(ingest_run_id, batch_index) - def batch_work_dir(self, ingest_run_srn: str, batch_index: int) -> Path: - return self._layout.ingest_batch_ingester_dir(ingest_run_srn, batch_index) + def batch_work_dir(self, ingest_run_id: str, batch_index: int) -> Path: + return self._layout.ingest_batch_ingester_dir(ingest_run_id, batch_index) - def batch_files_dir(self, ingest_run_srn: str, batch_index: int) -> Path: - return self._layout.ingest_batch_ingester_dir(ingest_run_srn, batch_index) / "files" + def batch_files_dir(self, ingest_run_id: str, batch_index: int) -> Path: + return self._layout.ingest_batch_ingester_dir(ingest_run_id, batch_index) / "files" - def hook_work_dir(self, ingest_run_srn: str, batch_index: int, hook_name: str) -> Path: - return self._layout.ingest_batch_hook_dir(ingest_run_srn, batch_index, hook_name) + def hook_work_dir(self, ingest_run_id: str, batch_index: int, hook_name: str) -> Path: + return self._layout.ingest_batch_hook_dir(ingest_run_id, batch_index, hook_name) diff --git a/server/osa/infrastructure/storage/layout.py b/server/osa/infrastructure/storage/layout.py index a00e734..32572a1 100644 --- a/server/osa/infrastructure/storage/layout.py +++ b/server/osa/infrastructure/storage/layout.py @@ -10,11 +10,6 @@ from pathlib import Path -def _safe_srn(srn: str) -> str: - """Convert an SRN to a filesystem-safe string.""" - return srn.replace(":", "_").replace("@", "_") - - class StorageLayout: """Computes storage paths relative to a data root. 
@@ -27,22 +22,22 @@ def __init__(self, data_dir: Path) -> None: # ── Ingest paths ───────────────────────────────────────────────── - def ingest_run_dir(self, ingest_run_srn: str) -> Path: + def ingest_run_dir(self, ingest_run_id: str) -> Path: """Root directory for an ingest run.""" - return self._data_dir / "ingests" / _safe_srn(ingest_run_srn) + return self._data_dir / "ingests" / ingest_run_id - def ingest_batch_dir(self, ingest_run_srn: str, batch_index: int) -> Path: + def ingest_batch_dir(self, ingest_run_id: str, batch_index: int) -> Path: """Directory for a specific batch within an ingest run.""" - return self.ingest_run_dir(ingest_run_srn) / "batches" / str(batch_index) + return self.ingest_run_dir(ingest_run_id) / "batches" / str(batch_index) - def ingest_batch_ingester_dir(self, ingest_run_srn: str, batch_index: int) -> Path: + def ingest_batch_ingester_dir(self, ingest_run_id: str, batch_index: int) -> Path: """Ingester output directory (records.jsonl, files/) for a batch.""" - return self.ingest_batch_dir(ingest_run_srn, batch_index) / "ingester" + return self.ingest_batch_dir(ingest_run_id, batch_index) / "ingester" - def ingest_batch_hook_dir(self, ingest_run_srn: str, batch_index: int, hook_name: str) -> Path: + def ingest_batch_hook_dir(self, ingest_run_id: str, batch_index: int, hook_name: str) -> Path: """Hook output directory for a batch.""" - return self.ingest_batch_dir(ingest_run_srn, batch_index) / "hooks" / hook_name + return self.ingest_batch_dir(ingest_run_id, batch_index) / "hooks" / hook_name - def ingest_session_file(self, ingest_run_srn: str) -> Path: + def ingest_session_file(self, ingest_run_id: str) -> Path: """Session state file for ingester continuation.""" - return self.ingest_run_dir(ingest_run_srn) / "session.json" + return self.ingest_run_dir(ingest_run_id) / "session.json" diff --git a/server/tests/unit/domain/feature/test_insert_record_features.py b/server/tests/unit/domain/feature/test_insert_record_features.py index dbd489e..9bb2a52 100644 --- a/server/tests/unit/domain/feature/test_insert_record_features.py +++ b/server/tests/unit/domain/feature/test_insert_record_features.py @@ -214,7 +214,7 @@ async def test_ingest_source_uses_source_fields(self): record_srn=_make_record_srn(), source=IngestSource( id="run-123-pdb-456", - ingest_run_srn="urn:osa:localhost:val:run123", + ingest_run_id="run123", upstream_source="pdb", ), metadata={"title": "Ingested"}, diff --git a/server/tests/unit/domain/ingest/test_ingest_run.py b/server/tests/unit/domain/ingest/test_ingest_run.py index 7086d8c..34ab0dd 100644 --- a/server/tests/unit/domain/ingest/test_ingest_run.py +++ b/server/tests/unit/domain/ingest/test_ingest_run.py @@ -10,7 +10,7 @@ def _make_run(**overrides) -> IngestRun: defaults = { - "srn": "urn:osa:localhost:ing:test-run", + "id": "test-run-id", "convention_srn": "urn:osa:localhost:conv:test-conv@1.0.0", "status": IngestStatus.PENDING, "started_at": datetime.now(UTC), diff --git a/server/tests/unit/domain/ingest/test_ingest_service.py b/server/tests/unit/domain/ingest/test_ingest_service.py index 1bb9b0f..7e1c8df 100644 --- a/server/tests/unit/domain/ingest/test_ingest_service.py +++ b/server/tests/unit/domain/ingest/test_ingest_service.py @@ -74,13 +74,13 @@ async def test_saves_and_emits_events(self) -> None: # First event: IngestRunStarted (observability) first_event = service.outbox.append.call_args_list[0][0][0] assert first_event.__class__.__name__ == "IngestRunStarted" - assert first_event.ingest_run_srn == run.srn + assert 
first_event.ingest_run_id == run.id assert first_event.convention_srn == run.convention_srn # Second event: NextBatchRequested (triggers first batch) second_event = service.outbox.append.call_args_list[1][0][0] assert second_event.__class__.__name__ == "NextBatchRequested" - assert second_event.ingest_run_srn == run.srn + assert second_event.ingest_run_id == run.id assert second_event.convention_srn == run.convention_srn @pytest.mark.asyncio diff --git a/server/tests/unit/domain/record/test_record_service.py b/server/tests/unit/domain/record/test_record_service.py index 9dff7c1..e62c2cd 100644 --- a/server/tests/unit/domain/record/test_record_service.py +++ b/server/tests/unit/domain/record/test_record_service.py @@ -138,7 +138,7 @@ async def test_publish_with_ingest_source( draft = RecordDraft( source=IngestSource( id="run-123-pdb-456", - ingest_run_srn="urn:osa:localhost:val:run123", + ingest_run_id="run123", upstream_source="pdb", ), metadata={"title": "Ingested Protein"}, diff --git a/server/tests/unit/domain/shared/test_record_source.py b/server/tests/unit/domain/shared/test_record_source.py index c1fe20d..64497e1 100644 --- a/server/tests/unit/domain/shared/test_record_source.py +++ b/server/tests/unit/domain/shared/test_record_source.py @@ -31,23 +31,23 @@ class TestIngestSource: def test_type_is_ingest(self): src = IngestSource( id="run-123-source-456", - ingest_run_srn="urn:osa:localhost:val:run123", + ingest_run_id="run123", upstream_source="pdb", ) assert src.type == "ingest" - def test_requires_ingest_run_srn(self): + def test_requires_ingest_run_id(self): with pytest.raises(ValidationError): IngestSource(id="run-123", upstream_source="pdb") def test_requires_upstream_source(self): with pytest.raises(ValidationError): - IngestSource(id="run-123", ingest_run_srn="urn:osa:localhost:val:run123") + IngestSource(id="run-123", ingest_run_id="run123") def test_serialization_roundtrip(self): src = IngestSource( id="run-123-source-456", - ingest_run_srn="urn:osa:localhost:val:run123", + ingest_run_id="run123", upstream_source="pdb", ) data = src.model_dump() @@ -65,7 +65,7 @@ def test_deserializes_ingest(self): data = { "type": "ingest", "id": "run-123", - "ingest_run_srn": "urn:osa:localhost:val:run1", + "ingest_run_id": "urn:osa:localhost:val:run1", "upstream_source": "geo", } adapter = TypeAdapter(RecordSource) @@ -83,7 +83,7 @@ def test_json_roundtrip(self): adapter = TypeAdapter(RecordSource) src = IngestSource( id="run-1", - ingest_run_srn="urn:osa:localhost:val:run1", + ingest_run_id="run1", upstream_source="pdb", ) json_str = adapter.dump_json(src) diff --git a/server/tests/unit/infrastructure/k8s/test_k8s_hook_runner.py b/server/tests/unit/infrastructure/k8s/test_k8s_hook_runner.py index d03b972..ead5ad1 100644 --- a/server/tests/unit/infrastructure/k8s/test_k8s_hook_runner.py +++ b/server/tests/unit/infrastructure/k8s/test_k8s_hook_runner.py @@ -215,7 +215,8 @@ def test_labels(self): labels = spec.spec.template.metadata.labels assert labels["osa.io/role"] == "hook" assert labels["osa.io/hook"] == "validate_dna" - assert labels["osa.io/run-id"] == "run-abc123" + assert labels["osa.io/ingest-run-id"] == "run-abc123" + assert labels["osa.io/ingest-run-batch"] == "0" def test_human_readable_job_name(self): runner = _make_runner() @@ -914,4 +915,5 @@ async def test_run_uses_run_id_from_inputs(self, tmp_path: Path): call_args = batch_api.create_namespaced_job.call_args spec = call_args[0][1] # positional arg: (namespace, spec) labels = spec.metadata.labels - assert 
labels["osa.io/run-id"] == "my-real-run-id" + assert labels["osa.io/ingest-run-id"] == "my-real-run-id" + assert labels["osa.io/ingest-run-batch"] == "0" From ce7b6257fb552586c4ed1c11b73ed1f299bb214b Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Sun, 5 Apr 2026 20:52:49 +0100 Subject: [PATCH 03/13] refactor: simplify ingest completion check logic Remove redundant is_complete property check and inline datetime creation to streamline completion validation flow feat: add clarifying comment for ingester exit code handling Explain that non-zero exit codes are treated as transient errors for ingesters due to upstream API failures vs permanent hook errors --- server/osa/domain/ingest/service/ingest.py | 4 +--- server/osa/infrastructure/k8s/ingester_runner.py | 3 +++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/server/osa/domain/ingest/service/ingest.py b/server/osa/domain/ingest/service/ingest.py index f009d77..7575915 100644 --- a/server/osa/domain/ingest/service/ingest.py +++ b/server/osa/domain/ingest/service/ingest.py @@ -122,10 +122,8 @@ async def fail_ingestion(self, ingest_run_id: IngestRunId) -> None: async def _check_completion(self, ingest_run: IngestRun) -> None: """Transition to COMPLETED and emit IngestCompleted if all batches are accounted for.""" - if not ingest_run.is_complete: + if not ingest_run.check_completion(datetime.now(UTC)): return - now = datetime.now(UTC) - ingest_run.check_completion(now) await self.ingest_repo.save(ingest_run) await self.outbox.append( IngestCompleted( diff --git a/server/osa/infrastructure/k8s/ingester_runner.py b/server/osa/infrastructure/k8s/ingester_runner.py index d505272..652523b 100644 --- a/server/osa/infrastructure/k8s/ingester_runner.py +++ b/server/osa/infrastructure/k8s/ingester_runner.py @@ -414,6 +414,9 @@ async def _diagnose_failure( return OOMError("Source killed by OOM") exit_code = getattr(terminated, "exit_code", -1) if exit_code != 0: + # Transient: ingester non-zero exit is often an upstream + # API failure (500, rate limit), not a code bug. + # Contrast with hooks where non-zero = PermanentError. return TransientError(f"Source exited with code {exit_code}") except Exception: pass From 408c9bd92a7350d713fb03cab51ce1ec5dfd4e8a Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Sun, 5 Apr 2026 21:45:46 +0100 Subject: [PATCH 04/13] feat: add backpressure control and container log capture Add backpressure mechanism to prevent ingester from overwhelming hook processing by limiting pending batches and deferring next batch requests when threshold is exceeded. Add container log capture functionality to both ingester and hook runners for improved failure diagnostics, with implementation for Kubernetes environments and stub for OCI containers. Update outbox to support delayed event delivery for backpressure implementation. 
--- server/osa/domain/ingest/handler/run_hooks.py | 5 +++- .../osa/domain/ingest/handler/run_ingester.py | 30 +++++++++++++++++++ server/osa/domain/shared/outbox.py | 7 +++-- .../domain/shared/port/event_repository.py | 2 ++ .../osa/domain/shared/port/ingester_runner.py | 10 +++++++ .../osa/domain/validation/port/hook_runner.py | 9 ++++++ .../osa/infrastructure/k8s/ingester_runner.py | 24 +++++++++++++++ server/osa/infrastructure/k8s/runner.py | 22 ++++++++++++++ .../osa/infrastructure/oci/ingester_runner.py | 4 +++ server/osa/infrastructure/oci/runner.py | 4 +++ .../persistence/repository/event.py | 2 ++ .../shared/test_outbox_consumer_groups.py | 6 ++-- .../domain/validation/test_hook_runner.py | 6 ++++ 13 files changed, 126 insertions(+), 5 deletions(-) diff --git a/server/osa/domain/ingest/handler/run_hooks.py b/server/osa/domain/ingest/handler/run_hooks.py index 5023a35..56ea861 100644 --- a/server/osa/domain/ingest/handler/run_hooks.py +++ b/server/osa/domain/ingest/handler/run_hooks.py @@ -14,7 +14,7 @@ from osa.domain.shared.model.srn import ConventionSRN from osa.domain.shared.outbox import Outbox from osa.domain.validation.model.hook_input import HookRecord -from osa.domain.validation.port.hook_runner import HookInputs +from osa.domain.validation.port.hook_runner import HookInputs, HookRunner from osa.domain.validation.service.hook import HookService from osa.infrastructure.logging import get_logger @@ -30,6 +30,7 @@ class RunHooks(EventHandler[IngesterBatchReady]): ingest_service: IngestService convention_service: ConventionService hook_service: HookService + hook_runner: HookRunner outbox: Outbox ingest_storage: IngestStoragePort @@ -89,11 +90,13 @@ async def handle(self, event: IngesterBatchReady) -> None: work_dirs=work_dirs, ) except PermanentError as e: + container_logs = await self.hook_runner.capture_logs(inputs.run_id) log.error( "[{short_id}] batch {batch_index} permanently failed: {error}", short_id=event.ingest_run_id[:8], batch_index=event.batch_index, error=str(e), + container_logs=container_logs, ingest_run_id=event.ingest_run_id, ) await self._fail_batch(event) diff --git a/server/osa/domain/ingest/handler/run_ingester.py b/server/osa/domain/ingest/handler/run_ingester.py index a9d397d..f0d1134 100644 --- a/server/osa/domain/ingest/handler/run_ingester.py +++ b/server/osa/domain/ingest/handler/run_ingester.py @@ -1,5 +1,6 @@ """RunIngester — runs ingester container on NextBatchRequested.""" +from datetime import UTC, datetime, timedelta from uuid import uuid4 from osa.domain.deposition.service.convention import ConventionService @@ -15,6 +16,9 @@ from osa.domain.shared.port.ingester_runner import IngesterInputs, IngesterRunner from osa.infrastructure.logging import get_logger +BACKPRESSURE_DELAY = timedelta(seconds=60) +MAX_PENDING_BATCHES = 1 + log = get_logger(__name__) @@ -35,6 +39,28 @@ async def handle(self, event: NextBatchRequested) -> None: if ingest_run is None: raise NotFoundError(f"Ingest run not found: {event.ingest_run_id}") + # Backpressure: don't ingest faster than hooks can process + pending = ( + ingest_run.batches_ingested - ingest_run.batches_completed - ingest_run.batches_failed + ) + if pending >= MAX_PENDING_BATCHES: + log.info( + "[{short_id}] backpressure: {pending} batches pending, deferring next pull", + short_id=event.ingest_run_id[:8], + pending=pending, + ingest_run_id=event.ingest_run_id, + ) + await self.outbox.append( + NextBatchRequested( + id=EventId(uuid4()), + ingest_run_id=event.ingest_run_id, + 
convention_srn=event.convention_srn, + batch_size=event.batch_size, + ), + deliver_after=datetime.now(UTC) + BACKPRESSURE_DELAY, + ) + return + if ingest_run.status == IngestStatus.PENDING: ingest_run.mark_running() await self.ingest_repo.save(ingest_run) @@ -69,6 +95,8 @@ async def handle(self, event: NextBatchRequested) -> None: inputs = IngesterInputs( convention_srn=convention.srn, + ingest_run_id=event.ingest_run_id, + batch_index=batch_index, config=convention.ingester.config, limit=effective_batch_limit, session=session, @@ -84,10 +112,12 @@ async def handle(self, event: NextBatchRequested) -> None: work_dir=work_dir, ) except PermanentError as e: + container_logs = await self.ingester_runner.capture_logs(event.ingest_run_id) log.error( "[{short_id}] ingester permanently failed: {error}", short_id=event.ingest_run_id[:8], error=str(e), + container_logs=container_logs, ingest_run_id=event.ingest_run_id, ) await self._fail_ingestion(event) diff --git a/server/osa/domain/shared/outbox.py b/server/osa/domain/shared/outbox.py index 8ac7d51..8659f3c 100644 --- a/server/osa/domain/shared/outbox.py +++ b/server/osa/domain/shared/outbox.py @@ -25,7 +25,7 @@ class Outbox(Service): _repo: EventRepository _registry: SubscriptionRegistry - async def append(self, event: Event) -> None: + async def append(self, event: Event, *, deliver_after: datetime | None = None) -> None: """Add an event to the outbox for delivery. Creates one delivery row per consumer group subscribed to this event type. @@ -33,10 +33,13 @@ async def append(self, event: Event) -> None: Args: event: The event to append. + deliver_after: If set, deliveries won't be claimed until this time. """ event_type_name = type(event).__name__ consumer_groups = self._registry.get(event_type_name, set()) - await self._repo.save_with_deliveries(event, consumer_groups=consumer_groups) + await self._repo.save_with_deliveries( + event, consumer_groups=consumer_groups, deliver_after=deliver_after + ) async def claim( self, diff --git a/server/osa/domain/shared/port/event_repository.py b/server/osa/domain/shared/port/event_repository.py index d0242cb..f963f3c 100644 --- a/server/osa/domain/shared/port/event_repository.py +++ b/server/osa/domain/shared/port/event_repository.py @@ -19,6 +19,7 @@ async def save_with_deliveries( self, event: Event, consumer_groups: set[str], + deliver_after: datetime | None = None, ) -> None: """Save event to the append-only log and create delivery rows. @@ -26,6 +27,7 @@ async def save_with_deliveries( event: The event to persist. consumer_groups: Set of consumer group names to create deliveries for. If empty, the event is saved without any delivery rows (audit-only). + deliver_after: If set, deliveries won't be claimed until this time. """ ... diff --git a/server/osa/domain/shared/port/ingester_runner.py b/server/osa/domain/shared/port/ingester_runner.py index c4bb7a0..fcf7d66 100644 --- a/server/osa/domain/shared/port/ingester_runner.py +++ b/server/osa/domain/shared/port/ingester_runner.py @@ -20,6 +20,8 @@ class IngesterInputs: """Inputs for an ingester container run.""" convention_srn: ConventionSRN + ingest_run_id: str = "" + batch_index: int = 0 config: dict[str, Any] | None = None since: datetime | None = None limit: int | None = None @@ -46,3 +48,11 @@ async def run( files_dir: Path, work_dir: Path, ) -> IngesterOutput: ... + + async def capture_logs(self, run_id: str) -> str: + """Capture recent container logs for a run. 
+ + Returns the last few lines of container/pod output, or empty string + if logs are unavailable. Used for failure diagnostics. + """ + ... diff --git a/server/osa/domain/validation/port/hook_runner.py b/server/osa/domain/validation/port/hook_runner.py index 55ff484..6ee2c89 100644 --- a/server/osa/domain/validation/port/hook_runner.py +++ b/server/osa/domain/validation/port/hook_runner.py @@ -43,3 +43,12 @@ async def run( ``input/`` is ephemeral (cleaned after run); ``output/`` persists for later reading. """ ... + + @abstractmethod + async def capture_logs(self, run_id: str) -> str: + """Capture recent container logs for a run. + + Returns the last few lines of container/pod output, or empty string + if logs are unavailable. Used for failure diagnostics. + """ + ... diff --git a/server/osa/infrastructure/k8s/ingester_runner.py b/server/osa/infrastructure/k8s/ingester_runner.py index 652523b..99671bd 100644 --- a/server/osa/infrastructure/k8s/ingester_runner.py +++ b/server/osa/infrastructure/k8s/ingester_runner.py @@ -59,6 +59,23 @@ def _s3_prefix(self, work_dir: Path, subdir: str) -> str: """Convert a PVC path + subdir to an S3 key prefix.""" return f"{relative_path(work_dir, self._config.data_mount_path)}/{subdir}" + async def capture_logs(self, run_id: str) -> str: + """Capture recent pod logs for an ingester Job identified by run_id.""" + namespace = self._config.namespace + label_selector = f"osa.io/role=ingester,osa.io/ingest-run-id={run_id}" + try: + pod_list = await self._core_api.list_namespaced_pod( + namespace, label_selector=label_selector + ) + for pod in pod_list.items: + log_str = await self._core_api.read_namespaced_pod_log( + pod.metadata.name, namespace, tail_lines=10 + ) + return log_str.strip() if log_str else "" + except Exception: + return "" + return "" + async def run( self, ingester: IngesterDefinition, @@ -101,6 +118,10 @@ async def _run_job( if existing and existing.startswith("active:"): job_name_to_watch = existing.split(":", 1)[1] else: + # Clear stale output from previous failed runs + output_prefix = self._s3_prefix(work_dir, "output") + await self._s3.delete_objects(output_prefix) + spec = self._build_job_spec( ingester, work_dir=work_dir, @@ -223,6 +244,9 @@ def _build_job_spec( } if convention_srn is not None: labels["osa.io/convention"] = label_value(convention_srn) + if inputs and inputs.ingest_run_id: + labels["osa.io/ingest-run-id"] = inputs.ingest_run_id + labels["osa.io/ingest-run-batch"] = str(inputs.batch_index) env = [ V1EnvVar(name="OSA_IN", value="/osa/in"), diff --git a/server/osa/infrastructure/k8s/runner.py b/server/osa/infrastructure/k8s/runner.py index 4568ca8..db1e796 100644 --- a/server/osa/infrastructure/k8s/runner.py +++ b/server/osa/infrastructure/k8s/runner.py @@ -59,6 +59,28 @@ def _s3_prefix(self, work_dir: Path, subdir: str) -> str: """Convert a PVC path + subdir to an S3 key prefix.""" return f"{relative_path(work_dir, self._config.data_mount_path)}/{subdir}" + async def capture_logs(self, run_id: str) -> str: + """Capture recent pod logs for a hook Job identified by run_id.""" + namespace = self._config.namespace + # Find the Job by label — run_id format is {uuid}_b{batch} + ingest_run_id = run_id.split("_b", 1)[0] + batch_index = run_id.split("_b", 1)[1] if "_b" in run_id else "0" + label_selector = ( + f"osa.io/ingest-run-id={ingest_run_id},osa.io/ingest-run-batch={batch_index}" + ) + try: + pod_list = await self._core_api.list_namespaced_pod( + namespace, label_selector=label_selector + ) + for pod in pod_list.items: + 
log_str = await self._core_api.read_namespaced_pod_log( + pod.metadata.name, namespace, tail_lines=10 + ) + return log_str.strip() if log_str else "" + except Exception: + return "" + return "" + async def run( self, hook: HookDefinition, diff --git a/server/osa/infrastructure/oci/ingester_runner.py b/server/osa/infrastructure/oci/ingester_runner.py index 3527cf9..51e38b7 100644 --- a/server/osa/infrastructure/oci/ingester_runner.py +++ b/server/osa/infrastructure/oci/ingester_runner.py @@ -47,6 +47,10 @@ def __init__( self._host_data_dir = host_data_dir self._container_data_dir = container_data_dir + async def capture_logs(self, run_id: str) -> str: + """OCI containers are deleted after run — logs captured inline during execution.""" + return "" + async def run( self, ingester: IngesterDefinition, diff --git a/server/osa/infrastructure/oci/runner.py b/server/osa/infrastructure/oci/runner.py index 88738dd..f114be4 100644 --- a/server/osa/infrastructure/oci/runner.py +++ b/server/osa/infrastructure/oci/runner.py @@ -44,6 +44,10 @@ def __init__( self._host_data_dir = host_data_dir self._container_data_dir = container_data_dir + async def capture_logs(self, run_id: str) -> str: + """OCI containers are deleted after run — logs captured inline during execution.""" + return "" + async def run( self, hook: HookDefinition, diff --git a/server/osa/infrastructure/persistence/repository/event.py b/server/osa/infrastructure/persistence/repository/event.py index e3c86e6..91eb200 100644 --- a/server/osa/infrastructure/persistence/repository/event.py +++ b/server/osa/infrastructure/persistence/repository/event.py @@ -32,6 +32,7 @@ async def save_with_deliveries( self, event: Event, consumer_groups: set[str], + deliver_after: datetime | None = None, ) -> None: """Save event to append-only log and create delivery rows.""" now = datetime.now(UTC) @@ -53,6 +54,7 @@ async def save_with_deliveries( consumer_group=group, status="pending", retry_count=0, + deliver_after=deliver_after, updated_at=now, ) await self._session.execute(delivery_stmt) diff --git a/server/tests/unit/domain/shared/test_outbox_consumer_groups.py b/server/tests/unit/domain/shared/test_outbox_consumer_groups.py index 5958ff1..322d76d 100644 --- a/server/tests/unit/domain/shared/test_outbox_consumer_groups.py +++ b/server/tests/unit/domain/shared/test_outbox_consumer_groups.py @@ -57,7 +57,7 @@ async def test_append_creates_deliveries_for_subscribed_groups( await outbox.append(event) mock_repo.save_with_deliveries.assert_called_once_with( - event, consumer_groups={"HandlerA", "HandlerB"} + event, consumer_groups={"HandlerA", "HandlerB"}, deliver_after=None ) async def test_append_audit_only_event_creates_zero_deliveries( @@ -73,7 +73,9 @@ class AuditEvent(Event): await outbox.append(event) - mock_repo.save_with_deliveries.assert_called_once_with(event, consumer_groups=set()) + mock_repo.save_with_deliveries.assert_called_once_with( + event, consumer_groups=set(), deliver_after=None + ) class TestOutboxClaimByConsumerGroup: diff --git a/server/tests/unit/domain/validation/test_hook_runner.py b/server/tests/unit/domain/validation/test_hook_runner.py index 5b7052c..2536ff0 100644 --- a/server/tests/unit/domain/validation/test_hook_runner.py +++ b/server/tests/unit/domain/validation/test_hook_runner.py @@ -89,6 +89,9 @@ async def run( duration_seconds=0.1, ) + async def capture_logs(self, run_id: str) -> str: + return "" + assert isinstance(FakeRunner(), HookRunner) def test_incomplete_class_does_not_satisfy_protocol(self): @@ -106,5 +109,8 @@ 
class LaxRunner: async def run(self, *args, **kwargs): pass + async def capture_logs(self, *args, **kwargs): + pass + # runtime_checkable only checks method names exist, not signatures assert isinstance(LaxRunner(), HookRunner) From 1fdca757ffef97fe18055a0fbab6b95f9cab3cf7 Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Sun, 5 Apr 2026 22:05:16 +0100 Subject: [PATCH 05/13] feat: increase MAX_PENDING_BATCHES from 1 to 4 to improve ingestion throughput --- server/osa/domain/ingest/handler/run_ingester.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/osa/domain/ingest/handler/run_ingester.py b/server/osa/domain/ingest/handler/run_ingester.py index f0d1134..2172949 100644 --- a/server/osa/domain/ingest/handler/run_ingester.py +++ b/server/osa/domain/ingest/handler/run_ingester.py @@ -17,7 +17,7 @@ from osa.infrastructure.logging import get_logger BACKPRESSURE_DELAY = timedelta(seconds=60) -MAX_PENDING_BATCHES = 1 +MAX_PENDING_BATCHES = 4 log = get_logger(__name__) From 21ada51ef56dacc35e315dcb2d841a29027d1daf Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Sun, 5 Apr 2026 22:33:08 +0100 Subject: [PATCH 06/13] feat: add max retries configuration and improve backpressure logging Add __max_retries__ = 100 to RunHooks handler to configure retry behavior and enhance backpressure log message to include delay duration for better observability --- server/osa/domain/ingest/handler/run_hooks.py | 1 + server/osa/domain/ingest/handler/run_ingester.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/server/osa/domain/ingest/handler/run_hooks.py b/server/osa/domain/ingest/handler/run_hooks.py index 56ea861..49da984 100644 --- a/server/osa/domain/ingest/handler/run_hooks.py +++ b/server/osa/domain/ingest/handler/run_hooks.py @@ -25,6 +25,7 @@ class RunHooks(EventHandler[IngesterBatchReady]): """Runs hook containers on an ingester batch and emits HookBatchCompleted.""" __claim_timeout__ = 3600.0 + __max_retries__ = 100 ingest_repo: IngestRunRepository ingest_service: IngestService diff --git a/server/osa/domain/ingest/handler/run_ingester.py b/server/osa/domain/ingest/handler/run_ingester.py index 2172949..0a7821e 100644 --- a/server/osa/domain/ingest/handler/run_ingester.py +++ b/server/osa/domain/ingest/handler/run_ingester.py @@ -45,9 +45,10 @@ async def handle(self, event: NextBatchRequested) -> None: ) if pending >= MAX_PENDING_BATCHES: log.info( - "[{short_id}] backpressure: {pending} batches pending, deferring next pull", + "[{short_id}] backpressure: {pending} batches pending, deferring next pull for {delay}s", short_id=event.ingest_run_id[:8], pending=pending, + delay=int(BACKPRESSURE_DELAY.total_seconds()), ingest_run_id=event.ingest_run_id, ) await self.outbox.append( From 1d92124e86fa17c00bb1eae4e09730193977575f Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Sun, 5 Apr 2026 22:36:34 +0100 Subject: [PATCH 07/13] refactor: replace batch count backpressure with cluster capacity check Replace MAX_PENDING_BATCHES constant with dynamic cluster capacity checking to prevent job submission when cluster cannot schedule more work, improving resource utilization and avoiding timeouts --- .../osa/domain/ingest/handler/run_ingester.py | 11 ++--- .../osa/domain/shared/port/ingester_runner.py | 8 ++++ .../osa/infrastructure/k8s/ingester_runner.py | 40 ++++++++++++------- .../osa/infrastructure/oci/ingester_runner.py | 4 ++ 4 files changed, 40 insertions(+), 23 deletions(-) diff --git a/server/osa/domain/ingest/handler/run_ingester.py 
b/server/osa/domain/ingest/handler/run_ingester.py index 0a7821e..048bfdc 100644 --- a/server/osa/domain/ingest/handler/run_ingester.py +++ b/server/osa/domain/ingest/handler/run_ingester.py @@ -17,7 +17,6 @@ from osa.infrastructure.logging import get_logger BACKPRESSURE_DELAY = timedelta(seconds=60) -MAX_PENDING_BATCHES = 4 log = get_logger(__name__) @@ -39,15 +38,11 @@ async def handle(self, event: NextBatchRequested) -> None: if ingest_run is None: raise NotFoundError(f"Ingest run not found: {event.ingest_run_id}") - # Backpressure: don't ingest faster than hooks can process - pending = ( - ingest_run.batches_ingested - ingest_run.batches_completed - ingest_run.batches_failed - ) - if pending >= MAX_PENDING_BATCHES: + # Backpressure: don't ingest if the cluster can't schedule more Jobs + if not await self.ingester_runner.has_capacity(): log.info( - "[{short_id}] backpressure: {pending} batches pending, deferring next pull for {delay}s", + "[{short_id}] backpressure: cluster has pending Jobs, deferring next pull for {delay}s", short_id=event.ingest_run_id[:8], - pending=pending, delay=int(BACKPRESSURE_DELAY.total_seconds()), ingest_run_id=event.ingest_run_id, ) diff --git a/server/osa/domain/shared/port/ingester_runner.py b/server/osa/domain/shared/port/ingester_runner.py index fcf7d66..4b51d14 100644 --- a/server/osa/domain/shared/port/ingester_runner.py +++ b/server/osa/domain/shared/port/ingester_runner.py @@ -56,3 +56,11 @@ async def capture_logs(self, run_id: str) -> str: if logs are unavailable. Used for failure diagnostics. """ ... + + async def has_capacity(self) -> bool: + """Check whether the cluster can schedule more Jobs. + + Returns False if there are pending (unschedulable) Jobs in the namespace. + Used by the ingester to avoid submitting work that will just timeout. + """ + ... 
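Since IngesterRunner is a structural Protocol, every implementation (and
every test double) now has to expose has_capacity alongside run and
capture_logs. A minimal sketch of a test fake, assuming a hypothetical
FakeIngesterRunner that is not part of this patch:

    class FakeIngesterRunner:
        """Test double: always reports free capacity."""

        async def run(self, *args, **kwargs):
            raise NotImplementedError  # supply behaviour per test

        async def capture_logs(self, run_id: str) -> str:
            return ""

        async def has_capacity(self) -> bool:
            return True  # never triggers the backpressure deferral

The K8s implementation below treats any Pending pod in the namespace as
"no capacity", and fails open (assumes capacity) if the API call errors.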
diff --git a/server/osa/infrastructure/k8s/ingester_runner.py b/server/osa/infrastructure/k8s/ingester_runner.py index 99671bd..510a52d 100644 --- a/server/osa/infrastructure/k8s/ingester_runner.py +++ b/server/osa/infrastructure/k8s/ingester_runner.py @@ -4,7 +4,6 @@ import asyncio import json -import logging import time from pathlib import Path from typing import TYPE_CHECKING @@ -20,6 +19,7 @@ from osa.domain.shared.model.srn import ConventionSRN from osa.domain.shared.port.ingester_runner import IngesterInputs, IngesterOutput, IngesterRunner from osa.infrastructure.k8s.errors import classify_api_error from osa.infrastructure.k8s.naming import job_name, label_value, sanitize_label +from osa.infrastructure.logging import get_logger from osa.infrastructure.runner_utils import ( relative_path, @@ -31,7 +31,7 @@ from osa.infrastructure.s3.client import S3Client -logger = logging.getLogger(__name__) +logger = get_logger(__name__) SCHEDULING_TIMEOUT = 120 @@ -59,6 +59,20 @@ def _s3_prefix(self, work_dir: Path, subdir: str) -> str: """Convert a PVC path + subdir to an S3 key prefix.""" return f"{relative_path(work_dir, self._config.data_mount_path)}/{subdir}" + async def has_capacity(self) -> bool: + """Check for pending (unschedulable) pods in the namespace.""" + namespace = self._config.namespace + try: + pod_list = await self._core_api.list_namespaced_pod( + namespace, field_selector="status.phase=Pending" + ) + return len(pod_list.items) == 0 + except Exception as e: + logger.warn( + "Failed to check cluster capacity: {error} — assuming capacity", error=str(e) + ) + return True + async def capture_logs(self, run_id: str) -> str: """Capture recent pod logs for an ingester Job identified by run_id.""" namespace = self._config.namespace @@ -133,12 +147,10 @@ async def _run_job( await self._batch_api.create_namespaced_job(namespace, spec) logger.info( - "Created K8s ingester Job", - extra={ - "job_name": job_name_to_watch, - "namespace": namespace, - "image": f"{ingester.image}@{ingester.digest}", - }, + "Created K8s ingester Job: {job_name}", + job_name=job_name_to_watch, + namespace=namespace, + image=f"{ingester.image}@{ingester.digest}", ) # Phase 1: Scheduling @@ -153,12 +165,10 @@ async def _run_job( output = await self._parse_source_output(work_dir, files_dir) logger.info( - "Source completed", - extra={ - "job_name": job_name_to_watch, - "record_count": len(output.records), - "has_session": output.session is not None, - }, + "Source completed: {job_name} ({record_count} records)", + job_name=job_name_to_watch, + record_count=len(output.records), + has_session=output.session is not None, ) return output @@ -457,4 +467,4 @@ async def _cleanup_job(self, job_name: str, namespace: str) -> None: except Exception as exc: if getattr(exc, "status", None) == 404: return - logger.warning("Failed to clean up K8s ingester Job", extra={"job_name": job_name}) + logger.warn("Failed to clean up K8s ingester Job: {job_name}", job_name=job_name) diff --git a/server/osa/infrastructure/oci/ingester_runner.py b/server/osa/infrastructure/oci/ingester_runner.py index 51e38b7..e458d85 100644 --- a/server/osa/infrastructure/oci/ingester_runner.py +++ b/server/osa/infrastructure/oci/ingester_runner.py @@ -47,6 +47,10 @@ def __init__( self._host_data_dir = host_data_dir self._container_data_dir = container_data_dir + async def has_capacity(self) -> bool: + """Docker doesn't have scheduling contention.""" + return True + async def capture_logs(self, run_id: str) -> str: """OCI containers are deleted after run — 
logs captured inline during execution.""" return "" From f863670e0d3703d6276aac881f24025fd317c8df Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Mon, 6 Apr 2026 00:12:03 +0100 Subject: [PATCH 08/13] fix: include tests directory in linting and formatting commands refactor: remove unused HookRunner dependency from RunHooks handler feat: add max_retries configuration to RunIngester event handler refactor: capture container logs in PermanentError for better debugging style: improve log message formatting for backpressure and retry delays feat: add container_logs field to InfrastructureError base class refactor: capture pod logs before cleanup in K8s runners for debugging style: improve logging messages and use structured logging consistently refactor: simplify module name abbreviation to SLF4J-style format --- server/Justfile | 8 +-- server/osa/domain/ingest/handler/run_hooks.py | 6 +- .../osa/domain/ingest/handler/run_ingester.py | 6 +- server/osa/domain/shared/error.py | 2 + server/osa/infrastructure/event/worker.py | 4 +- .../osa/infrastructure/k8s/ingester_runner.py | 31 +++++++++- server/osa/infrastructure/k8s/runner.py | 59 ++++++++++++++----- server/osa/infrastructure/logging.py | 13 ++-- 8 files changed, 90 insertions(+), 39 deletions(-) diff --git a/server/Justfile b/server/Justfile index eb93bc9..6f96d22 100644 --- a/server/Justfile +++ b/server/Justfile @@ -37,17 +37,17 @@ test-cov: # Run linter and type checker lint: @just fix - uv run ruff check osa + uv run ruff check osa tests uv run ty check osa # Fix formatting and lint issues fix: - uv run ruff format osa - uv run ruff check --fix osa + uv run ruff format osa tests + uv run ruff check --fix osa tests # Format code format: - uv run ruff format osa + uv run ruff format osa tests # === Database === diff --git a/server/osa/domain/ingest/handler/run_hooks.py b/server/osa/domain/ingest/handler/run_hooks.py index 49da984..658481b 100644 --- a/server/osa/domain/ingest/handler/run_hooks.py +++ b/server/osa/domain/ingest/handler/run_hooks.py @@ -14,7 +14,7 @@ from osa.domain.shared.model.srn import ConventionSRN from osa.domain.shared.outbox import Outbox from osa.domain.validation.model.hook_input import HookRecord -from osa.domain.validation.port.hook_runner import HookInputs, HookRunner +from osa.domain.validation.port.hook_runner import HookInputs from osa.domain.validation.service.hook import HookService from osa.infrastructure.logging import get_logger @@ -31,7 +31,6 @@ class RunHooks(EventHandler[IngesterBatchReady]): ingest_service: IngestService convention_service: ConventionService hook_service: HookService - hook_runner: HookRunner outbox: Outbox ingest_storage: IngestStoragePort @@ -91,13 +90,12 @@ async def handle(self, event: IngesterBatchReady) -> None: work_dirs=work_dirs, ) except PermanentError as e: - container_logs = await self.hook_runner.capture_logs(inputs.run_id) log.error( "[{short_id}] batch {batch_index} permanently failed: {error}", short_id=event.ingest_run_id[:8], batch_index=event.batch_index, error=str(e), - container_logs=container_logs, + container_logs=e.container_logs or "", ingest_run_id=event.ingest_run_id, ) await self._fail_batch(event) diff --git a/server/osa/domain/ingest/handler/run_ingester.py b/server/osa/domain/ingest/handler/run_ingester.py index 048bfdc..9c9d369 100644 --- a/server/osa/domain/ingest/handler/run_ingester.py +++ b/server/osa/domain/ingest/handler/run_ingester.py @@ -25,6 +25,7 @@ class RunIngester(EventHandler[NextBatchRequested]): """Runs ingester container and emits 
IngesterBatchReady per batch.""" __claim_timeout__ = 3600.0 + __max_retries__ = 20 ingest_repo: IngestRunRepository ingest_service: IngestService @@ -41,7 +42,7 @@ async def handle(self, event: NextBatchRequested) -> None: # Backpressure: don't ingest if the cluster can't schedule more Jobs if not await self.ingester_runner.has_capacity(): log.info( - "[{short_id}] backpressure: cluster has pending Jobs, deferring next pull for {delay}s", + "[{short_id}] backpressure: cluster has pending Jobs, deferring next pull +{delay}s", short_id=event.ingest_run_id[:8], delay=int(BACKPRESSURE_DELAY.total_seconds()), ingest_run_id=event.ingest_run_id, @@ -108,12 +109,11 @@ async def handle(self, event: NextBatchRequested) -> None: work_dir=work_dir, ) except PermanentError as e: - container_logs = await self.ingester_runner.capture_logs(event.ingest_run_id) log.error( "[{short_id}] ingester permanently failed: {error}", short_id=event.ingest_run_id[:8], error=str(e), - container_logs=container_logs, + container_logs=e.container_logs or "", ingest_run_id=event.ingest_run_id, ) await self._fail_ingestion(event) diff --git a/server/osa/domain/shared/error.py b/server/osa/domain/shared/error.py index b4f334e..a26c0d6 100644 --- a/server/osa/domain/shared/error.py +++ b/server/osa/domain/shared/error.py @@ -59,6 +59,8 @@ class AuthorizationError(DomainError): class InfrastructureError(OSAError): """Base class for infrastructure/system errors.""" + container_logs: str | None = None + class StorageUnavailableError(InfrastructureError): """Storage backend (database, object store) is unavailable.""" diff --git a/server/osa/infrastructure/event/worker.py b/server/osa/infrastructure/event/worker.py index 1b26ba5..733db35 100644 --- a/server/osa/infrastructure/event/worker.py +++ b/server/osa/infrastructure/event/worker.py @@ -216,11 +216,11 @@ async def _poll_once(self) -> bool: deliver_after = datetime.now(UTC) + timedelta(seconds=backoff_seconds) logger.warn( "Worker '{name}' transient failure: {error} " - "(attempt={attempt}, next_retry={next_retry})", + "(attempt={attempt}, next_retry=+{backoff}s)", name=self.name, error=str(e), attempt=delivery.retry_count, - next_retry=deliver_after.isoformat(), + backoff=backoff_seconds, ) await outbox.mark_failed_with_retry( delivery.id, diff --git a/server/osa/infrastructure/k8s/ingester_runner.py b/server/osa/infrastructure/k8s/ingester_runner.py index 510a52d..6257595 100644 --- a/server/osa/infrastructure/k8s/ingester_runner.py +++ b/server/osa/infrastructure/k8s/ingester_runner.py @@ -127,10 +127,15 @@ async def _run_job( ) if existing == "succeeded": + logger.info("Reusing output from completed ingester Job") return await self._parse_source_output(work_dir, files_dir) if existing and existing.startswith("active:"): job_name_to_watch = existing.split(":", 1)[1] + logger.info( + "Attaching to running ingester Job: {job_name}", + job_name=job_name_to_watch, + ) else: # Clear stale output from previous failed runs output_prefix = self._s3_prefix(work_dir, "output") @@ -165,13 +170,19 @@ async def _run_job( output = await self._parse_source_output(work_dir, files_dir) logger.info( - "Source completed: {job_name} ({record_count} records)", + "Ingester batch completed: {job_name} ({record_count} records)", job_name=job_name_to_watch, record_count=len(output.records), has_session=output.session is not None, ) return output + except InfrastructureError as e: + # Capture logs before cleanup destroys the pod + if job_name_to_watch: + e.container_logs = await 
self._capture_pod_logs(job_name_to_watch, namespace) + raise + finally: if job_name_to_watch: await self._cleanup_job(job_name_to_watch, namespace) @@ -424,13 +435,28 @@ async def _wait_for_completion( raise TransientError(f"Watch timeout waiting for ingester Job {job_name} completion") + async def _capture_pod_logs(self, job_name: str, namespace: str) -> str: + """Capture tail logs from a Job's pod. Returns empty if unavailable.""" + try: + pod_list = await self._core_api.list_namespaced_pod( + namespace, label_selector=f"job-name={job_name}" + ) + for pod in pod_list.items: + log_str = await self._core_api.read_namespaced_pod_log( + pod.metadata.name, namespace, tail_lines=10 + ) + return log_str.strip() if log_str else "" + except Exception: + return "" + return "" + async def _diagnose_failure( self, job_name: str, namespace: str, failure_info: str, ) -> InfrastructureError: - """Inspect pod status and return the appropriate exception.""" + """Inspect pod status, capture logs, and return the appropriate exception.""" if "DeadlineExceeded" in failure_info: return TransientError("Ingester timed out (deadline exceeded)") @@ -464,6 +490,7 @@ async def _cleanup_job(self, job_name: str, namespace: str) -> None: namespace, propagation_policy="Background", ) + logger.info("Cleaned up K8s ingester Job: {job_name}", job_name=job_name) except Exception as exc: if getattr(exc, "status", None) == 404: return diff --git a/server/osa/infrastructure/k8s/runner.py b/server/osa/infrastructure/k8s/runner.py index db1e796..2aa1659 100644 --- a/server/osa/infrastructure/k8s/runner.py +++ b/server/osa/infrastructure/k8s/runner.py @@ -4,7 +4,6 @@ import asyncio import json -import logging import time from pathlib import Path from typing import TYPE_CHECKING @@ -20,6 +19,7 @@ from osa.domain.validation.model.hook_result import HookResult, HookStatus from osa.domain.validation.port.hook_runner import HookInputs, HookRunner from osa.infrastructure.k8s.errors import classify_api_error +from osa.infrastructure.logging import get_logger from osa.infrastructure.k8s.naming import job_name from osa.infrastructure.runner_utils import ( detect_rejection, @@ -32,7 +32,7 @@ from osa.infrastructure.s3.client import S3Client -logger = logging.getLogger(__name__) +logger = get_logger(__name__) SCHEDULING_TIMEOUT = 120 # seconds to wait for pod to leave Pending @@ -115,12 +115,21 @@ async def _run_job( existing = await self._check_existing_job(namespace, hook.name, inputs.run_id) if existing == "succeeded": - # Read output from completed Job + logger.info( + "Reusing output from completed hook Job (hook={hook_name}, run_id={run_id})", + hook_name=hook.name, + run_id=inputs.run_id, + ) return await self._parse_hook_result(hook, work_dir, start_time) if existing and existing.startswith("active:"): # Attach to running Job job_name_to_watch = existing.split(":", 1)[1] + logger.info( + "Attaching to running hook Job: {job_name} (hook={hook_name})", + job_name=job_name_to_watch, + hook_name=hook.name, + ) else: # Create new Job (no existing or failed) # Mount the parent of all per-record file dirs — works for @@ -139,14 +148,10 @@ async def _run_job( await self._batch_api.create_namespaced_job(namespace, spec) logger.info( - "Created K8s Job", - extra={ - "job_name": job_name_to_watch, - "namespace": namespace, - "image": f"{hook.runtime.image}@{hook.runtime.digest}", - "hook_name": hook.name, - "run_id": inputs.run_id, - }, + "Created K8s hook Job: {job_name} (hook={hook_name}, run_id={run_id})", + job_name=job_name_to_watch, + 
hook_name=hook.name, + run_id=inputs.run_id, ) # Phase 1: Wait for scheduling @@ -161,6 +166,12 @@ async def _run_job( return await self._parse_hook_result(hook, work_dir, start_time) + except InfrastructureError as e: + # Capture logs before cleanup destroys the pod + if job_name_to_watch: + e.container_logs = await self._capture_pod_logs(job_name_to_watch, namespace) + raise + finally: if job_name_to_watch: await self._cleanup_job(job_name_to_watch, namespace) @@ -446,13 +457,28 @@ async def _wait_for_completion( raise TransientError(f"Watch timeout waiting for Job {job_name} completion") + async def _capture_pod_logs(self, job_name: str, namespace: str) -> str: + """Capture tail logs from a Job's pod. Returns empty if unavailable.""" + try: + pod_list = await self._core_api.list_namespaced_pod( + namespace, label_selector=f"job-name={job_name}" + ) + for pod in pod_list.items: + log_str = await self._core_api.read_namespaced_pod_log( + pod.metadata.name, namespace, tail_lines=10 + ) + return log_str.strip() if log_str else "" + except Exception: + return "" + return "" + async def _diagnose_failure( self, job_name: str, namespace: str, failure_info: str, ) -> InfrastructureError: - """Inspect pod status and return the appropriate exception.""" + """Inspect pod status, capture logs, and return the appropriate exception.""" if "DeadlineExceeded" in failure_info: return TransientError("Hook timed out (deadline exceeded)") @@ -488,11 +514,12 @@ async def _cleanup_job( namespace, propagation_policy="Background", ) - logger.info("Cleaned up K8s Job", extra={"job_name": job_name}) + logger.info("Cleaned up K8s hook Job: {job_name}", job_name=job_name) except Exception as exc: if getattr(exc, "status", None) == 404: return # Already gone - logger.warning( - "Failed to clean up K8s Job", - extra={"job_name": job_name, "error": str(exc)}, + logger.warn( + "Failed to clean up K8s hook Job: {job_name} ({error})", + job_name=job_name, + error=str(exc), ) diff --git a/server/osa/infrastructure/logging.py b/server/osa/infrastructure/logging.py index 90070b0..e0de990 100644 --- a/server/osa/infrastructure/logging.py +++ b/server/osa/infrastructure/logging.py @@ -114,16 +114,13 @@ def _shorten_module(name: str) -> str: ) if len(short) <= _MODULE_WIDTH: return short - # Truncate: keep first and last segment, abbreviate middle + # SLF4J-style: abbreviate all segments except the last to first char parts = short.split(".") - if len(parts) <= 2: + if len(parts) == 1: return short[:_MODULE_WIDTH] - # Keep first and last, drop middle segments until it fits - first, *middle, last = parts - while middle and len(f"{first}.{'.'.join(middle)}.{last}") > _MODULE_WIDTH: - middle.pop(0) - result = f"{first}.{'.'.join(middle)}.{last}" if middle else f"{first}.{last}" - return result[:_MODULE_WIDTH] + *prefixes, last = parts + abbreviated = ".".join(p[0] for p in prefixes) + "." 
+ last
+        return abbreviated[:_MODULE_WIDTH]
 
 
 class Logger:

From ea55bb2470cb9aa8e6a57cc79fd331e364c58a4c Mon Sep 17 00:00:00 2001
From: Rory Byrne
Date: Fri, 10 Apr 2026 16:32:58 +0100
Subject: [PATCH 09/13] feat: enhance k8s job failure handling and cleanup

- Delete stale files from previous failed runs, in addition to clearing
  stale output, to prevent data corruption
- Log container logs explicitly when Jobs fail, to aid debugging and
  troubleshooting
---
 server/osa/infrastructure/k8s/ingester_runner.py | 13 +++++++++++--
 server/osa/infrastructure/k8s/runner.py          |  9 ++++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/server/osa/infrastructure/k8s/ingester_runner.py b/server/osa/infrastructure/k8s/ingester_runner.py
index 6257595..4cfe493 100644
--- a/server/osa/infrastructure/k8s/ingester_runner.py
+++ b/server/osa/infrastructure/k8s/ingester_runner.py
@@ -137,9 +137,11 @@ async def _run_job(
                 job_name=job_name_to_watch,
             )
         else:
-            # Clear stale output from previous failed runs
+            # Clear stale output and files from previous failed runs
             output_prefix = self._s3_prefix(work_dir, "output")
             await self._s3.delete_objects(output_prefix)
+            files_prefix = relative_path(files_dir, self._config.data_mount_path)
+            await self._s3.delete_objects(files_prefix)
 
             spec = self._build_job_spec(
                 ingester,
@@ -180,7 +182,14 @@ async def _run_job(
         except InfrastructureError as e:
             # Capture logs before cleanup destroys the pod
             if job_name_to_watch:
-                e.container_logs = await self._capture_pod_logs(job_name_to_watch, namespace)
+                logs = await self._capture_pod_logs(job_name_to_watch, namespace)
+                e.container_logs = logs
+                if logs:
+                    logger.error(
+                        "Job {job_name} failed — container logs:\n{logs}",
+                        job_name=job_name_to_watch,
+                        logs=logs,
+                    )
             raise
 
         finally:
diff --git a/server/osa/infrastructure/k8s/runner.py b/server/osa/infrastructure/k8s/runner.py
index 2aa1659..23e59b7 100644
--- a/server/osa/infrastructure/k8s/runner.py
+++ b/server/osa/infrastructure/k8s/runner.py
@@ -169,7 +169,14 @@ async def _run_job(
         except InfrastructureError as e:
             # Capture logs before cleanup destroys the pod
             if job_name_to_watch:
-                e.container_logs = await self._capture_pod_logs(job_name_to_watch, namespace)
+                logs = await self._capture_pod_logs(job_name_to_watch, namespace)
+                e.container_logs = logs
+                if logs:
+                    logger.error(
+                        "Job {job_name} failed — container logs:\n{logs}",
+                        job_name=job_name_to_watch,
+                        logs=logs,
+                    )
             raise
 
         finally:

From eecf19597d2d24c7623d1f3e2c093571fa08e4e6 Mon Sep 17 00:00:00 2001
From: Rory Byrne
Date: Sun, 12 Apr 2026 15:35:36 +0100
Subject: [PATCH 10/13] feat: add retry exhaustion handling for batch publish failures

Add an on_exhausted method to the PublishBatch handler that accounts for
failed batches when publish retries are exhausted, keeping ingest run state
accurate and preventing ingestion runs from getting stuck.
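
For context on when exhaustion fires: the worker backs transient failures
off as min(300, 60 * 2**retry_count) seconds (the formula is visible as
context in the worker hunks of PATCH 13), so a failing delivery is retried
after 60s, 120s, 240s, and then every 300s until the retry limit is hit and
on_exhausted runs. A quick illustration of that schedule (the helper below
is illustrative only, not code in this series):

    def backoff_schedule(retries: int) -> list[int]:
        # Mirror the worker's transient-failure backoff: min(300, 60 * 2**n)
        return [min(300, 60 * 2**n) for n in range(retries)]

    assert backoff_schedule(5) == [60, 120, 240, 300, 300]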
--- .../domain/ingest/handler/publish_batch.py | 11 +++ .../unit/domain/ingest/test_publish_batch.py | 82 +++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 server/tests/unit/domain/ingest/test_publish_batch.py diff --git a/server/osa/domain/ingest/handler/publish_batch.py b/server/osa/domain/ingest/handler/publish_batch.py index f5b7086..4fe8edd 100644 --- a/server/osa/domain/ingest/handler/publish_batch.py +++ b/server/osa/domain/ingest/handler/publish_batch.py @@ -14,6 +14,7 @@ from osa.domain.ingest.model.ingester_record import IngesterRecord from osa.domain.ingest.port.repository import IngestRunRepository from osa.domain.ingest.port.storage import IngestStoragePort +from osa.domain.ingest.service.ingest import IngestService from osa.domain.record.model.draft import RecordDraft from osa.domain.record.service import RecordService from osa.domain.shared.error import NotFoundError @@ -35,6 +36,7 @@ class PublishBatch(EventHandler[HookBatchCompleted]): feature_storage: FeatureStoragePort outbox: Outbox ingest_storage: IngestStoragePort + ingest_service: IngestService async def handle(self, event: HookBatchCompleted) -> None: ingest_run = await self.ingest_repo.get(event.ingest_run_id) @@ -175,6 +177,15 @@ async def handle(self, event: HookBatchCompleted) -> None: ingest_run_id=event.ingest_run_id, ) + async def on_exhausted(self, event: HookBatchCompleted) -> None: + """Called when publish retries are exhausted — account for the failed batch.""" + log.error( + "batch {batch_index} publish retries exhausted", + batch_index=event.batch_index, + ingest_run_id=event.ingest_run_id, + ) + await self.ingest_service.fail_batch(event.ingest_run_id) + async def _get_passed_records( ingester_records: list[IngesterRecord], diff --git a/server/tests/unit/domain/ingest/test_publish_batch.py b/server/tests/unit/domain/ingest/test_publish_batch.py new file mode 100644 index 0000000..c9c2368 --- /dev/null +++ b/server/tests/unit/domain/ingest/test_publish_batch.py @@ -0,0 +1,82 @@ +"""Tests for PublishBatch.on_exhausted — ensures failed batches are accounted for.""" + +from datetime import UTC, datetime +from unittest.mock import AsyncMock +from uuid import uuid4 + +import pytest + +from osa.domain.ingest.event.events import HookBatchCompleted +from osa.domain.ingest.handler.publish_batch import PublishBatch +from osa.domain.ingest.model.ingest_run import IngestRun, IngestRunId, IngestStatus +from osa.domain.shared.event import EventId + + +def _make_event( + ingest_run_id: str = "run-1", + batch_index: int = 0, +) -> HookBatchCompleted: + return HookBatchCompleted( + id=EventId(uuid4()), + ingest_run_id=IngestRunId(ingest_run_id), + batch_index=batch_index, + ) + + +def _make_ingest_run( + *, + ingest_run_id: str = "run-1", + batches_ingested: int = 3, + batches_completed: int = 1, + batches_failed: int = 0, + ingestion_finished: bool = True, +) -> IngestRun: + return IngestRun( + id=IngestRunId(ingest_run_id), + convention_srn="urn:osa:localhost:conv:test@1.0.0", + status=IngestStatus.RUNNING, + batches_ingested=batches_ingested, + batches_completed=batches_completed, + batches_failed=batches_failed, + ingestion_finished=ingestion_finished, + batch_size=100, + started_at=datetime.now(UTC), + ) + + +def _make_handler() -> PublishBatch: + ingest_service = AsyncMock() + ingest_service.fail_batch = AsyncMock() + + return PublishBatch( + ingest_repo=AsyncMock(), + convention_service=AsyncMock(), + record_service=AsyncMock(), + feature_storage=AsyncMock(), + outbox=AsyncMock(), + 
ingest_storage=AsyncMock(),
+        ingest_service=ingest_service,
+    )
+
+
+class TestPublishBatchOnExhausted:
+    @pytest.mark.asyncio
+    async def test_on_exhausted_calls_fail_batch(self) -> None:
+        """When retries are exhausted, the batch must be accounted for as failed."""
+        handler = _make_handler()
+        event = _make_event()
+
+        await handler.on_exhausted(event)
+
+        handler.ingest_service.fail_batch.assert_called_once_with(
+            IngestRunId("run-1"),
+        )
+
+    @pytest.mark.asyncio
+    async def test_on_exhausted_exists(self) -> None:
+        """PublishBatch must override on_exhausted (not rely on base class no-op)."""
+        assert hasattr(PublishBatch, "on_exhausted")
+        # Verify it's not the base class default
+        from osa.domain.shared.event import EventHandler
+
+        assert PublishBatch.on_exhausted is not EventHandler.on_exhausted

From e5361d92ba75c6279bc7b023f2fdea6d56bdc0dc Mon Sep 17 00:00:00 2001
From: Rory Byrne
Date: Sun, 12 Apr 2026 22:34:02 +0100
Subject: [PATCH 11/13] feat: handle OOM exhaustion in RunHooks to allow partial results

Add OOMError handling to the RunHooks handler so it emits HookBatchCompleted
even when memory is exhausted, allowing PublishBatch to process records that
passed validation before the OOM occurred.
---
 server/osa/domain/ingest/handler/run_hooks.py |  16 ++-
 .../unit/domain/ingest/test_run_hooks.py      | 120 ++++++++++++++++++
 2 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 server/tests/unit/domain/ingest/test_run_hooks.py

diff --git a/server/osa/domain/ingest/handler/run_hooks.py b/server/osa/domain/ingest/handler/run_hooks.py
index 658481b..4794092 100644
--- a/server/osa/domain/ingest/handler/run_hooks.py
+++ b/server/osa/domain/ingest/handler/run_hooks.py
@@ -1,5 +1,7 @@
 """RunHooks — runs hook containers on an ingester batch."""
 
 from pathlib import Path
 from uuid import uuid4
+
+from osa.domain.validation.model.hook_result import HookResult
 
@@ -9,7 +11,7 @@
 from osa.domain.ingest.port.storage import IngestStoragePort
 from osa.domain.ingest.service.ingest import IngestService
-from osa.domain.shared.error import NotFoundError, PermanentError
+from osa.domain.shared.error import NotFoundError, OOMError, PermanentError
 from osa.domain.shared.event import EventHandler, EventId
 from osa.domain.shared.model.srn import ConventionSRN
 from osa.domain.shared.outbox import Outbox
@@ -83,12 +85,24 @@ async def handle(self, event: IngesterBatchReady) -> None:
         )
 
         # Run all hooks via HookService
+        results: list[HookResult] = []
         try:
             results = await self.hook_service.run_hooks_for_batch(
                 hooks=convention.hooks,
                 inputs=inputs,
                 work_dirs=work_dirs,
             )
+        except OOMError as e:
+            # OOM exhaustion after retries — HookService already wrote outcomes
+            # (passed + errored) to disk. Fall through to emit HookBatchCompleted
+            # so PublishBatch can publish the records that DID pass.
+            log.warn(
+                "[{short_id}] batch {batch_index} OOM exhausted, publishing partial results: {error}",
+                short_id=event.ingest_run_id[:8],
+                batch_index=event.batch_index,
+                error=str(e),
+                ingest_run_id=event.ingest_run_id,
+            )
         except PermanentError as e:
             log.error(
                 "[{short_id}] batch {batch_index} permanently failed: {error}",
diff --git a/server/tests/unit/domain/ingest/test_run_hooks.py b/server/tests/unit/domain/ingest/test_run_hooks.py
new file mode 100644
index 0000000..fbea714
--- /dev/null
+++ b/server/tests/unit/domain/ingest/test_run_hooks.py
@@ -0,0 +1,120 @@
+"""Tests for RunHooks — OOM exhaustion should still emit HookBatchCompleted."""
+
+from datetime import UTC, datetime
+from pathlib import Path
+from unittest.mock import AsyncMock
+from uuid import uuid4
+
+import pytest
+
+from osa.domain.ingest.event.events import HookBatchCompleted, IngesterBatchReady
+from osa.domain.ingest.handler.run_hooks import RunHooks
+from osa.domain.ingest.model.ingest_run import IngestRun, IngestRunId, IngestStatus
+from osa.domain.shared.error import OOMError, PermanentError
+from osa.domain.shared.event import EventId
+from osa.domain.shared.model.hook import HookDefinition, OciConfig, OciLimits, TableFeatureSpec
+
+
+def _make_hook(name: str = "pockets") -> HookDefinition:
+    return HookDefinition(
+        name=name,
+        runtime=OciConfig(
+            image="ghcr.io/test/pockets:v1",
+            digest="sha256:abc123",
+            limits=OciLimits(memory="1g"),
+        ),
+        feature=TableFeatureSpec(cardinality="one", columns=[]),
+    )
+
+
+def _make_event(
+    ingest_run_id: str = "run-1",
+    batch_index: int = 0,
+) -> IngesterBatchReady:
+    return IngesterBatchReady(
+        id=EventId(uuid4()),
+        ingest_run_id=IngestRunId(ingest_run_id),
+        batch_index=batch_index,
+        has_more=False,
+    )
+
+
+def _make_convention():
+    conv = AsyncMock()
+    conv.hooks = [_make_hook()]
+    return conv
+
+
+def _make_handler(*, hook_service_side_effect=None) -> RunHooks:
+    ingest_repo = AsyncMock()
+    ingest_repo.get.return_value = IngestRun(
+        id=IngestRunId("run-1"),
+        convention_srn="urn:osa:localhost:conv:test@1.0.0",
+        status=IngestStatus.RUNNING,
+        batch_size=100,
+        started_at=datetime.now(UTC),
+    )
+
+    convention_service = AsyncMock()
+    convention_service.get_convention.return_value = _make_convention()
+
+    ingest_storage = AsyncMock()
+    ingest_storage.read_records.return_value = [
+        {"source_id": "rec-1", "metadata": {}, "files": []},
+    ]
+    ingest_storage.batch_files_dir.return_value = Path("/tmp/files")
+    ingest_storage.hook_work_dir.return_value = Path("/tmp/work")
+
+    hook_service = AsyncMock()
+    if hook_service_side_effect:
+        hook_service.run_hooks_for_batch.side_effect = hook_service_side_effect
+
+    return RunHooks(
+        ingest_repo=ingest_repo,
+        ingest_service=AsyncMock(),
+        convention_service=convention_service,
+        hook_service=hook_service,
+        outbox=AsyncMock(),
+        ingest_storage=ingest_storage,
+    )
+
+
+class TestRunHooksOOMExhaustion:
+    @pytest.mark.asyncio
+    async def test_oom_exhaustion_emits_hook_batch_completed(self) -> None:
+        """OOM exhaustion should still emit HookBatchCompleted so passed records get published."""
+        handler = _make_handler(hook_service_side_effect=OOMError("OOM after 3 retries"))
+        event = _make_event()
+
+        await handler.handle(event)
+
+        # HookBatchCompleted should be emitted (not swallowed)
+        emitted_events = [call[0][0] for call in handler.outbox.append.call_args_list]
+        assert any(isinstance(e, HookBatchCompleted) for e in emitted_events), (
+            "HookBatchCompleted should be emitted on OOM exhaustion 
" + "so PublishBatch can publish records that passed" + ) + + @pytest.mark.asyncio + async def test_oom_exhaustion_does_not_fail_batch(self) -> None: + """OOM exhaustion should not call _fail_batch — the batch has partial results.""" + handler = _make_handler(hook_service_side_effect=OOMError("OOM after 3 retries")) + event = _make_event() + + await handler.handle(event) + + handler.ingest_service.fail_batch.assert_not_called() + + @pytest.mark.asyncio + async def test_permanent_error_still_fails_batch(self) -> None: + """Non-OOM PermanentError should still fail the batch (no partial results).""" + error = PermanentError("image pull failed") + handler = _make_handler(hook_service_side_effect=error) + event = _make_event() + + await handler.handle(event) + + handler.ingest_service.fail_batch.assert_called_once() + # HookBatchCompleted should NOT be emitted + emitted_events = [call[0][0] for call in handler.outbox.append.call_args_list] + assert not any(isinstance(e, HookBatchCompleted) for e in emitted_events) From c759d1d12f2521de07e1d360527de6c349e9cb39 Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Wed, 15 Apr 2026 09:31:59 +0100 Subject: [PATCH 12/13] fix: only trigger backpressure on genuinely unschedulable pods has_capacity() was checking status.phase=Pending which matches all pods in normal startup (image pull, node assignment), causing false backpressure that throttled ingestion to ~1 batch/60s. Now checks for PodScheduled=False with reason=Unschedulable, which only fires when the K8s scheduler genuinely can't place a pod. --- .../osa/infrastructure/k8s/ingester_runner.py | 14 ++- .../k8s/test_k8s_ingester_runner.py | 90 +++++++++++++++++++ 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/server/osa/infrastructure/k8s/ingester_runner.py b/server/osa/infrastructure/k8s/ingester_runner.py index 4cfe493..721739c 100644 --- a/server/osa/infrastructure/k8s/ingester_runner.py +++ b/server/osa/infrastructure/k8s/ingester_runner.py @@ -60,13 +60,23 @@ def _s3_prefix(self, work_dir: Path, subdir: str) -> str: return f"{relative_path(work_dir, self._config.data_mount_path)}/{subdir}" async def has_capacity(self) -> bool: - """Check for pending (unschedulable) pods in the namespace.""" + """Check for unschedulable pods in the namespace. + + Only triggers backpressure when a pod has PodScheduled=False with + reason=Unschedulable, meaning the cluster genuinely can't place it. + Pods that are Pending but actively scheduling (image pull, node + assignment) are normal and should not block ingestion. 
+ """ namespace = self._config.namespace try: pod_list = await self._core_api.list_namespaced_pod( namespace, field_selector="status.phase=Pending" ) - return len(pod_list.items) == 0 + for pod in pod_list.items: + for condition in pod.status.conditions or []: + if condition.type == "PodScheduled" and condition.reason == "Unschedulable": + return False + return True except Exception as e: logger.warn( "Failed to check cluster capacity: {error} — assuming capacity", error=str(e) diff --git a/server/tests/unit/infrastructure/k8s/test_k8s_ingester_runner.py b/server/tests/unit/infrastructure/k8s/test_k8s_ingester_runner.py index c6e54b6..0341814 100644 --- a/server/tests/unit/infrastructure/k8s/test_k8s_ingester_runner.py +++ b/server/tests/unit/infrastructure/k8s/test_k8s_ingester_runner.py @@ -59,6 +59,96 @@ def _make_runner(config: K8sConfig | None = None) -> K8sIngesterRunner: return K8sIngesterRunner(api_client=api_client, config=config or _make_config(), s3=s3) +# --------------------------------------------------------------------------- +# Backpressure capacity check +# --------------------------------------------------------------------------- + + +def _make_pod(*, phase: str = "Pending", conditions: list | None = None) -> MagicMock: + pod = MagicMock() + pod.status.phase = phase + pod.status.conditions = conditions + return pod + + +def _make_condition(*, type_: str, reason: str, status: str = "False") -> MagicMock: + cond = MagicMock() + cond.type = type_ + cond.reason = reason + cond.status = status + return cond + + +class TestHasCapacity: + @pytest.mark.asyncio + async def test_no_pending_pods_returns_true(self): + runner = _make_runner() + runner._core_api = AsyncMock() + pod_list = MagicMock() + pod_list.items = [] + runner._core_api.list_namespaced_pod.return_value = pod_list + + assert await runner.has_capacity() is True + + @pytest.mark.asyncio + async def test_pending_but_schedulable_returns_true(self): + """Pods in Pending that are actively scheduling (no Unschedulable condition) + should NOT trigger backpressure — they'll be Running in seconds.""" + runner = _make_runner() + runner._core_api = AsyncMock() + + # Pod is Pending with PodScheduled=True (normal startup) + pod = _make_pod( + phase="Pending", + conditions=[_make_condition(type_="PodScheduled", reason="", status="True")], + ) + pod_list = MagicMock() + pod_list.items = [pod] + runner._core_api.list_namespaced_pod.return_value = pod_list + + assert await runner.has_capacity() is True + + @pytest.mark.asyncio + async def test_pending_no_conditions_returns_true(self): + """Pods in Pending with no conditions yet (just created) should not block.""" + runner = _make_runner() + runner._core_api = AsyncMock() + + pod = _make_pod(phase="Pending", conditions=None) + pod_list = MagicMock() + pod_list.items = [pod] + runner._core_api.list_namespaced_pod.return_value = pod_list + + assert await runner.has_capacity() is True + + @pytest.mark.asyncio + async def test_unschedulable_pod_returns_false(self): + """A pod with PodScheduled=False reason=Unschedulable means the cluster is full.""" + runner = _make_runner() + runner._core_api = AsyncMock() + + pod = _make_pod( + phase="Pending", + conditions=[ + _make_condition(type_="PodScheduled", reason="Unschedulable", status="False"), + ], + ) + pod_list = MagicMock() + pod_list.items = [pod] + runner._core_api.list_namespaced_pod.return_value = pod_list + + assert await runner.has_capacity() is False + + @pytest.mark.asyncio + async def test_api_failure_assumes_capacity(self): + 
"""If the K8s API fails, assume capacity (don't block on transient API errors).""" + runner = _make_runner() + runner._core_api = AsyncMock() + runner._core_api.list_namespaced_pod.side_effect = Exception("API timeout") + + assert await runner.has_capacity() is True + + # --------------------------------------------------------------------------- # Job spec differences (T021) # --------------------------------------------------------------------------- From f6902127e48636201963e869604ff4a0091874e3 Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Wed, 15 Apr 2026 13:26:08 +0100 Subject: [PATCH 13/13] fix: guard on_exhausted errors and consolidate completion logic Two fixes: 1. Worker: wrap on_exhausted() in try/except so mark_failed always runs. Previously, if on_exhausted raised (e.g., DB outage during fail_batch), mark_failed was skipped and the delivery stayed claimed indefinitely. 2. PublishBatch: delegate completion to IngestService.complete_batch() instead of duplicating the completion check and IngestCompleted emission. IngestCompleted is now emitted from exactly one place. --- .../domain/ingest/handler/publish_batch.py | 30 +------- server/osa/domain/ingest/service/ingest.py | 17 +++++ server/osa/infrastructure/event/worker.py | 27 ++++++- .../unit/domain/ingest/test_publish_batch.py | 29 +------- .../event/test_worker_exhaustion.py | 72 +++++++++++++++++++ 5 files changed, 118 insertions(+), 57 deletions(-) create mode 100644 server/tests/unit/infrastructure/event/test_worker_exhaustion.py diff --git a/server/osa/domain/ingest/handler/publish_batch.py b/server/osa/domain/ingest/handler/publish_batch.py index 4fe8edd..e179bd5 100644 --- a/server/osa/domain/ingest/handler/publish_batch.py +++ b/server/osa/domain/ingest/handler/publish_batch.py @@ -1,6 +1,5 @@ """PublishBatch — reads hook outputs, bulk-publishes passing records.""" -from datetime import UTC, datetime from uuid import uuid4 from osa.domain.deposition.service.convention import ConventionService @@ -8,9 +7,7 @@ from osa.domain.ingest.event.events import ( HookBatchCompleted, IngestBatchPublished, - IngestCompleted, ) -from osa.domain.ingest.model.ingest_run import IngestStatus from osa.domain.ingest.model.ingester_record import IngesterRecord from osa.domain.ingest.port.repository import IngestRunRepository from osa.domain.ingest.port.storage import IngestStoragePort @@ -151,31 +148,8 @@ async def handle(self, event: HookBatchCompleted) -> None: ) ) - # Update counters atomically - updated = await self.ingest_repo.increment_completed( - event.ingest_run_id, - published_count=published_count, - ) - - # Check completion condition - if updated.is_complete and updated.status == IngestStatus.RUNNING: - updated.check_completion(datetime.now(UTC)) - await self.ingest_repo.save(updated) - - await self.outbox.append( - IngestCompleted( - id=EventId(uuid4()), - ingest_run_id=event.ingest_run_id, - total_published=updated.published_count, - ) - ) - short_id = event.ingest_run_id[:8] - log.info( - "[{short_id}] COMPLETE: {total_published} records published", - short_id=short_id, - total_published=updated.published_count, - ingest_run_id=event.ingest_run_id, - ) + # Update counters and check completion via service + await self.ingest_service.complete_batch(event.ingest_run_id, published_count) async def on_exhausted(self, event: HookBatchCompleted) -> None: """Called when publish retries are exhausted — account for the failed batch.""" diff --git a/server/osa/domain/ingest/service/ingest.py b/server/osa/domain/ingest/service/ingest.py 
index 7575915..fd091af 100644 --- a/server/osa/domain/ingest/service/ingest.py +++ b/server/osa/domain/ingest/service/ingest.py @@ -96,6 +96,17 @@ async def start_ingest( ) return ingest_run + async def complete_batch(self, ingest_run_id: IngestRunId, published_count: int) -> None: + """Account for a successfully processed batch. + + Increments batches_completed and published_count atomically, + then checks the completion condition. + """ + ingest_run = await self.ingest_repo.increment_completed( + ingest_run_id, published_count=published_count + ) + await self._check_completion(ingest_run) + async def fail_batch(self, ingest_run_id: IngestRunId) -> None: """Account for a batch that permanently failed hook processing. @@ -132,3 +143,9 @@ async def _check_completion(self, ingest_run: IngestRun) -> None: total_published=ingest_run.published_count, ) ) + log.info( + "[{short_id}] COMPLETE: {total_published} records published", + short_id=str(ingest_run.id)[:8], + total_published=ingest_run.published_count, + ingest_run_id=ingest_run.id, + ) diff --git a/server/osa/infrastructure/event/worker.py b/server/osa/infrastructure/event/worker.py index 733db35..202abf2 100644 --- a/server/osa/infrastructure/event/worker.py +++ b/server/osa/infrastructure/event/worker.py @@ -209,7 +209,14 @@ async def _poll_once(self) -> bool: name=self.name, error=str(e), ) - await handler.on_exhausted(delivery.event) + try: + await handler.on_exhausted(delivery.event) + except Exception as exhausted_err: + logger.error( + "Worker '{name}' on_exhausted failed: {error}", + name=self.name, + error=str(exhausted_err), + ) await outbox.mark_failed(delivery.id, str(e)) else: backoff_seconds = min(300, 60 * (2**delivery.retry_count)) @@ -238,7 +245,14 @@ async def _poll_once(self) -> bool: error=str(e), ) for delivery in result.deliveries: - await handler.on_exhausted(delivery.event) + try: + await handler.on_exhausted(delivery.event) + except Exception as exhausted_err: + logger.error( + "Worker '{name}' on_exhausted failed: {error}", + name=self.name, + error=str(exhausted_err), + ) await outbox.mark_failed(delivery.id, str(e)) except Exception as e: @@ -252,7 +266,14 @@ async def _poll_once(self) -> bool: for delivery in result.deliveries: exhausted = delivery.retry_count + 1 >= self._max_retries if exhausted: - await handler.on_exhausted(delivery.event) + try: + await handler.on_exhausted(delivery.event) + except Exception as exhausted_err: + logger.error( + "Worker '{name}' on_exhausted failed: {error}", + name=self.name, + error=str(exhausted_err), + ) await outbox.mark_failed(delivery.id, str(e)) else: backoff_seconds = min(30, 5 ** (delivery.retry_count + 1)) diff --git a/server/tests/unit/domain/ingest/test_publish_batch.py b/server/tests/unit/domain/ingest/test_publish_batch.py index c9c2368..2f1479e 100644 --- a/server/tests/unit/domain/ingest/test_publish_batch.py +++ b/server/tests/unit/domain/ingest/test_publish_batch.py @@ -1,6 +1,5 @@ -"""Tests for PublishBatch.on_exhausted — ensures failed batches are accounted for.""" +"""Tests for PublishBatch — exhaustion handling and completion delegation.""" -from datetime import UTC, datetime from unittest.mock import AsyncMock from uuid import uuid4 @@ -8,7 +7,7 @@ from osa.domain.ingest.event.events import HookBatchCompleted from osa.domain.ingest.handler.publish_batch import PublishBatch -from osa.domain.ingest.model.ingest_run import IngestRun, IngestRunId, IngestStatus +from osa.domain.ingest.model.ingest_run import IngestRunId from osa.domain.shared.event import 
EventId @@ -23,30 +22,10 @@ def _make_event( ) -def _make_ingest_run( - *, - ingest_run_id: str = "run-1", - batches_ingested: int = 3, - batches_completed: int = 1, - batches_failed: int = 0, - ingestion_finished: bool = True, -) -> IngestRun: - return IngestRun( - id=IngestRunId(ingest_run_id), - convention_srn="urn:osa:localhost:conv:test@1.0.0", - status=IngestStatus.RUNNING, - batches_ingested=batches_ingested, - batches_completed=batches_completed, - batches_failed=batches_failed, - ingestion_finished=ingestion_finished, - batch_size=100, - started_at=datetime.now(UTC), - ) - - def _make_handler() -> PublishBatch: ingest_service = AsyncMock() ingest_service.fail_batch = AsyncMock() + ingest_service.complete_batch = AsyncMock() return PublishBatch( ingest_repo=AsyncMock(), @@ -75,8 +54,6 @@ async def test_on_exhausted_calls_fail_batch(self) -> None: @pytest.mark.asyncio async def test_on_exhausted_exists(self) -> None: """PublishBatch must override on_exhausted (not rely on base class no-op).""" - assert hasattr(PublishBatch, "on_exhausted") - # Verify it's not the base class default from osa.domain.shared.event import EventHandler assert PublishBatch.on_exhausted is not EventHandler.on_exhausted diff --git a/server/tests/unit/infrastructure/event/test_worker_exhaustion.py b/server/tests/unit/infrastructure/event/test_worker_exhaustion.py new file mode 100644 index 0000000..ef10915 --- /dev/null +++ b/server/tests/unit/infrastructure/event/test_worker_exhaustion.py @@ -0,0 +1,72 @@ +"""Tests for worker on_exhausted error safety. + +Verifies that mark_failed is always called even when on_exhausted raises, +by testing the Worker._poll_once flow end-to-end with a handler that +raises PermanentError from handle() and RuntimeError from on_exhausted(). 
+""" + +from datetime import UTC, datetime +from typing import Any +from unittest.mock import AsyncMock, MagicMock +from uuid import uuid4 + +import pytest + +from osa.domain.ingest.event.events import NextBatchRequested +from osa.domain.ingest.handler.run_ingester import RunIngester +from osa.domain.ingest.model.ingest_run import IngestRunId +from osa.domain.shared.event import EventId +from osa.infrastructure.event.worker import Worker + + +class TestWorkerOnExhaustedErrorSafety: + @pytest.mark.asyncio + async def test_mark_failed_runs_even_if_on_exhausted_raises(self) -> None: + """If on_exhausted throws, mark_failed must still run.""" + worker = Worker(RunIngester) + + # Mock handler whose on_exhausted raises + handler = AsyncMock(spec=RunIngester) + handler.handle = AsyncMock(side_effect=Exception("something broke")) + handler.on_exhausted = AsyncMock(side_effect=RuntimeError("DB down during on_exhausted")) + + outbox = AsyncMock() + + # Create a delivery that has exhausted retries + event = NextBatchRequested( + id=EventId(uuid4()), + ingest_run_id=IngestRunId("run-1"), + convention_srn="urn:osa:localhost:conv:test@1.0.0", + batch_size=100, + ) + delivery = MagicMock() + delivery.id = "delivery-1" + delivery.event = event + delivery.retry_count = 100 # exceeds __max_retries__ + + claim_result = MagicMock() + claim_result.deliveries = [delivery] + claim_result.events = [event] + claim_result.claimed_at = datetime.now(UTC) + outbox.claim.return_value = claim_result + + # Wire up the DI scope mock + async def mock_get(t: type) -> Any: + if t is RunIngester: + return handler + return outbox + + scope = AsyncMock() + scope.get = mock_get + + container = MagicMock() + container.return_value.__aenter__ = AsyncMock(return_value=scope) + container.return_value.__aexit__ = AsyncMock(return_value=False) + + worker.set_container(container) + await worker._poll_once() + + # on_exhausted was called (and raised) + handler.on_exhausted.assert_called_once() + # mark_failed was STILL called despite the exception + outbox.mark_failed.assert_called_once_with("delivery-1", "something broke")