diff --git a/packages/producer/package.json b/packages/producer/package.json index 9f8bb44d2..bf7333b47 100644 --- a/packages/producer/package.json +++ b/packages/producer/package.json @@ -37,6 +37,7 @@ "perf:gate": "tsx src/perf-gate.ts", "check:runtime-conformance": "tsx src/runtime-conformance.ts", "benchmark": "tsx src/benchmark.ts", + "bench:hdr": "tsx src/benchmark.ts --tags hdr", "test": "tsx src/regression-harness.ts", "test:update": "tsx src/regression-harness.ts --update", "docker:build:test": "docker build -f ../../Dockerfile.test -t hyperframes-producer:test ../..", diff --git a/packages/producer/src/benchmark.ts b/packages/producer/src/benchmark.ts index fb3357f47..68d843eff 100644 --- a/packages/producer/src/benchmark.ts +++ b/packages/producer/src/benchmark.ts @@ -2,14 +2,21 @@ /** * Render Benchmark * - * Runs each test fixture multiple times and records per-stage timing. - * Results are saved to producer/tests/perf/benchmark-results.json. + * Runs each test fixture multiple times and records per-stage timing + * plus peak heap/RSS memory. Results are saved to + * producer/tests/perf/benchmark-results.json. * * Usage: * bun run benchmark # 3 runs per fixture (default) * bun run benchmark -- --runs 5 # 5 runs per fixture * bun run benchmark -- --only chat # single fixture * bun run benchmark -- --exclude-tags slow + * bun run benchmark -- --tags hdr # only fixtures tagged "hdr" + * bun run bench:hdr # convenience: --tags hdr + * + * `--tags` and `--exclude-tags` may be passed together; a fixture must match + * at least one positive tag (when `--tags` is provided) AND must not match + * any excluded tag. */ import { @@ -52,6 +59,10 @@ interface FixtureResult { averages: { totalElapsedMs: number; captureAvgMs: number | null; + /** Average of per-run peak RSS in MiB. `null` if no run reported memory. */ + peakRssMb: number | null; + /** Average of per-run peak heapUsed in MiB. `null` if no run reported memory. 
*/ + peakHeapUsedMb: number | null; stages: Record; }; } @@ -64,9 +75,19 @@ interface BenchmarkResults { fixtures: FixtureResult[]; } -function parseArgs(): { runs: number; only: string | null; excludeTags: string[] } { +interface BenchmarkArgs { + runs: number; + only: string | null; + /** Positive tag filter — fixture must include at least one. Empty = no positive filter. */ + tags: string[]; + /** Negative tag filter — fixture must not include any. Applied after `tags`. */ + excludeTags: string[]; +} + +function parseArgs(): BenchmarkArgs { let runs = 3; let only: string | null = null; + const tags: string[] = []; const excludeTags: string[] = []; for (let i = 2; i < process.argv.length; i++) { @@ -76,17 +97,21 @@ function parseArgs(): { runs: number; only: string | null; excludeTags: string[] } else if (process.argv[i] === "--only" && process.argv[i + 1]) { i++; only = process.argv[i] ?? null; + } else if (process.argv[i] === "--tags" && process.argv[i + 1]) { + i++; + tags.push(...(process.argv[i] ?? "").split(",").filter(Boolean)); } else if (process.argv[i] === "--exclude-tags" && process.argv[i + 1]) { i++; - excludeTags.push(...(process.argv[i] ?? "").split(",")); + excludeTags.push(...(process.argv[i] ?? "").split(",").filter(Boolean)); } } - return { runs, only, excludeTags }; + return { runs, only, tags, excludeTags }; } function discoverFixtures( only: string | null, + tags: string[], excludeTags: string[], ): Array<{ id: string; dir: string; meta: TestMeta }> { const fixtures: Array<{ id: string; dir: string; meta: TestMeta }> = []; @@ -101,7 +126,11 @@ function discoverFixtures( if (only && entry !== only) continue; const meta: TestMeta = JSON.parse(readFileSync(metaPath, "utf-8")); - if (excludeTags.length > 0 && meta.tags?.some((t) => excludeTags.includes(t))) continue; + const fixtureTags = meta.tags ?? []; + // Positive filter (--tags): if provided, fixture must match at least one. 
+ if (tags.length > 0 && !fixtureTags.some((t) => tags.includes(t))) continue; + // Negative filter (--exclude-tags): always wins. + if (excludeTags.length > 0 && fixtureTags.some((t) => excludeTags.includes(t))) continue; fixtures.push({ id: entry, dir, meta }); } @@ -114,16 +143,35 @@ function avg(nums: number[]): number { return Math.round(nums.reduce((a, b) => a + b, 0) / nums.length); } +/** + * Average a possibly-empty list of optional numbers. Returns `null` when no + * defined samples exist so the JSON output stays consistent with the + * `peakRssMb: number | null` shape the consumer (perf README, regression + * checks) expects — silently coercing missing memory data to `0` would mask + * older results regenerated against this harness. + */ +function avgOrNull(nums: Array): number | null { + const filtered = nums.filter((n): n is number => typeof n === "number"); + if (filtered.length === 0) return null; + return avg(filtered); +} + async function runBenchmark(): Promise { - const { runs, only, excludeTags } = parseArgs(); - const fixtures = discoverFixtures(only, excludeTags); + const { runs, only, tags, excludeTags } = parseArgs(); + const fixtures = discoverFixtures(only, tags, excludeTags); if (fixtures.length === 0) { - console.error("No fixtures found"); + console.error( + `No fixtures found${tags.length ? ` matching tags=[${tags.join(",")}]` : ""}` + + `${excludeTags.length ? ` excluding=[${excludeTags.join(",")}]` : ""}`, + ); process.exit(1); } - console.log(`\nšŸ Benchmark: ${fixtures.length} fixture(s) Ɨ ${runs} run(s)\n`); + const filterDesc = + (tags.length ? ` tags=[${tags.join(",")}]` : "") + + (excludeTags.length ? 
` exclude=[${excludeTags.join(",")}]` : ""); + console.log(`\nšŸ Benchmark: ${fixtures.length} fixture(s) Ɨ ${runs} run(s)${filterDesc}\n`); const results: FixtureResult[] = []; @@ -162,8 +210,12 @@ async function runBenchmark(): Promise { if (job.perfSummary) { fixtureRuns.push({ run: r + 1, perfSummary: job.perfSummary }); const ps = job.perfSummary; + const memDesc = + ps.peakRssMb != null || ps.peakHeapUsedMb != null + ? ` | peak RSS ${ps.peakRssMb ?? "?"}MiB heap ${ps.peakHeapUsedMb ?? "?"}MiB` + : ""; console.log( - ` āœ“ ${ps.totalElapsedMs}ms total | capture avg ${ps.captureAvgMs ?? "?"}ms/frame | ${ps.totalFrames} frames`, + ` āœ“ ${ps.totalElapsedMs}ms total | capture avg ${ps.captureAvgMs ?? "?"}ms/frame | ${ps.totalFrames} frames${memDesc}`, ); } } @@ -192,19 +244,20 @@ async function runBenchmark(): Promise { runs: fixtureRuns, averages: { totalElapsedMs: avg(fixtureRuns.map((r) => r.perfSummary.totalElapsedMs)), - captureAvgMs: - avg( - fixtureRuns - .filter((r) => r.perfSummary.captureAvgMs != null) - .map((r) => r.perfSummary.captureAvgMs ?? 0), - ) || null, + captureAvgMs: avgOrNull(fixtureRuns.map((r) => r.perfSummary.captureAvgMs)), + peakRssMb: avgOrNull(fixtureRuns.map((r) => r.perfSummary.peakRssMb)), + peakHeapUsedMb: avgOrNull(fixtureRuns.map((r) => r.perfSummary.peakHeapUsedMb)), stages: avgStages, }, }; results.push(fixtureResult); - console.log(`\n Average: ${fixtureResult.averages.totalElapsedMs}ms total`); + const memLine = + fixtureResult.averages.peakRssMb != null || fixtureResult.averages.peakHeapUsedMb != null + ? ` | peak RSS ${fixtureResult.averages.peakRssMb ?? "?"}MiB heap ${fixtureResult.averages.peakHeapUsedMb ?? 
"?"}MiB` + : ""; + console.log(`\n Average: ${fixtureResult.averages.totalElapsedMs}ms total${memLine}`); for (const [stage, ms] of Object.entries(fixtureResult.averages.stages)) { const pct = Math.round((ms / fixtureResult.averages.totalElapsedMs) * 100); console.log(` ${stage}: ${ms}ms (${pct}%)`); @@ -226,7 +279,7 @@ async function runBenchmark(): Promise { // Print summary table console.log("\n\nšŸ“Š BENCHMARK SUMMARY"); - console.log("═".repeat(80)); + console.log("═".repeat(95)); console.log( "Fixture".padEnd(25) + "Total".padStart(10) + @@ -234,9 +287,11 @@ async function runBenchmark(): Promise { "Extract".padStart(10) + "Audio".padStart(10) + "Capture".padStart(10) + - "Encode".padStart(10), + "Encode".padStart(10) + + "PeakRSS".padStart(10) + + "PeakHeap".padStart(10), ); - console.log("─".repeat(80)); + console.log("─".repeat(95)); for (const f of results) { const s = f.averages.stages; @@ -247,11 +302,13 @@ async function runBenchmark(): Promise { `${s.videoExtractMs ?? "-"}ms`.padStart(10) + `${s.audioProcessMs ?? "-"}ms`.padStart(10) + `${s.captureMs ?? "-"}ms`.padStart(10) + - `${s.encodeMs ?? "-"}ms`.padStart(10), + `${s.encodeMs ?? "-"}ms`.padStart(10) + + `${f.averages.peakRssMb ?? "-"}MiB`.padStart(10) + + `${f.averages.peakHeapUsedMb ?? "-"}MiB`.padStart(10), ); } - console.log("═".repeat(80)); + console.log("═".repeat(95)); console.log(`\nResults saved to: ${outputPath}`); } diff --git a/packages/producer/src/services/renderOrchestrator.ts b/packages/producer/src/services/renderOrchestrator.ts index 54fcea821..308bb85fd 100644 --- a/packages/producer/src/services/renderOrchestrator.ts +++ b/packages/producer/src/services/renderOrchestrator.ts @@ -239,6 +239,23 @@ export interface RenderPerfSummary { stages: Record; captureAvgMs?: number; capturePeakMs?: number; + /** + * Peak resident set size (RSS) observed during the render, in MiB. + * + * Sampled every 250ms by a process-wide poller; surfaces gross memory + * regressions (e.g. 
unbounded image-cache growth) that wall-clock numbers + * miss. Optional because callers can serialize older `RenderPerfSummary` + * shapes back into this type. + */ + peakRssMb?: number; + /** + * Peak V8 heap used observed during the render, in MiB. + * + * Useful as a finer-grained complement to {@link peakRssMb} — RSS includes + * native ffmpeg/Chrome allocations, while heapUsed isolates JS-object growth + * inside the orchestrator. Optional for the same back-compat reason. + */ + peakHeapUsedMb?: number; hdrDiagnostics?: HdrDiagnostics; } @@ -963,6 +980,27 @@ export async function executeRenderJob( const chunkedEncodeSize = cfg.chunkSizeFrames; const enableStreamingEncode = cfg.enableStreamingEncode; + // Periodic memory sampler — surfaces peak RSS/heap so the benchmark harness + // can detect memory regressions (e.g. unbounded image-cache growth) that + // wall-clock numbers miss. Sampled every 250ms; the interval is `unref`'d so + // it never keeps the event loop alive on its own, and always cleared in the + // finally block below regardless of how the render exits. + let peakRssBytes = 0; + let peakHeapUsedBytes = 0; + const sampleMemory = (): void => { + try { + const m = process.memoryUsage(); + if (m.rss > peakRssBytes) peakRssBytes = m.rss; + if (m.heapUsed > peakHeapUsedBytes) peakHeapUsedBytes = m.heapUsed; + } catch { + // Defensive: process.memoryUsage() shouldn't throw, but if it ever + // does we don't want to take down the render for a benchmark accessory. 
+ } + }; + sampleMemory(); + const memSamplerInterval: NodeJS.Timeout = setInterval(sampleMemory, 250); + memSamplerInterval.unref?.(); + try { const assertNotAborted = () => { if (abortSignal?.aborted) { @@ -2512,6 +2550,7 @@ export async function executeRenderJob( updateJobStatus(job, "complete", "Render complete", 100, onProgress); const totalElapsed = Date.now() - pipelineStart; + sampleMemory(); const perfSummary: RenderPerfSummary = { renderId: job.id, @@ -2533,6 +2572,8 @@ export async function executeRenderJob( : undefined, captureAvgMs: totalFrames > 0 ? Math.round((perfStages.captureMs ?? 0) / totalFrames) : undefined, + peakRssMb: Math.round(peakRssBytes / (1024 * 1024)), + peakHeapUsedMb: Math.round(peakHeapUsedBytes / (1024 * 1024)), }; job.perfSummary = perfSummary; if (job.config.debug) { @@ -2666,5 +2707,7 @@ export async function executeRenderJob( if (restoreLogger) restoreLogger(); throw error; + } finally { + clearInterval(memSamplerInterval); } } diff --git a/packages/producer/tests/perf/README.md b/packages/producer/tests/perf/README.md new file mode 100644 index 000000000..543fe840d --- /dev/null +++ b/packages/producer/tests/perf/README.md @@ -0,0 +1,100 @@ +# Producer Performance Benchmarks + +End-to-end render benchmark harness driven by `src/benchmark.ts`. Discovers +fixtures under `packages/producer/tests//` (any directory with a +`meta.json`), runs them through the full producer pipeline, and emits per-stage +timing plus peak memory metrics into `benchmark-results.json`. + +The harness is deliberately lightweight — it doesn't enforce thresholds. It's +designed for **regression spotting**: capture a baseline, change something, +re-run, eyeball the diff. For pass/fail thresholds see `tests/perf/baseline.json` +and the perf-regression checks in the integration test suite. 
+
+## Quick start
+
+```bash
+# Run every fixture (3 runs each by default)
+cd packages/producer
+bun run benchmark
+
+# HDR-only baseline (PQ + HLG fixtures, ~50s on M-series Macs)
+bun run bench:hdr
+
+# Average a fixture across multiple runs
+bunx tsx src/benchmark.ts --tags hdr --runs 3
+
+# Just the PQ regression
+bunx tsx src/benchmark.ts --only hdr-regression
+
+# Skip fixtures tagged "slow"
+bunx tsx src/benchmark.ts --exclude-tags slow
+```
+
+Results are written to
+`packages/producer/tests/perf/benchmark-results.json` and a summary table is
+printed to stdout.
+
+## CLI flags
+
+| Flag | Description |
+| --- | --- |
+| `--runs N` | Run each fixture `N` times and average (default: 3). |
+| `--only <fixture>` | Run a single fixture by directory name. |
+| `--tags a,b` | **Positive** filter: only fixtures whose `meta.json#tags` contains *any* of the listed tags. |
+| `--exclude-tags a,b` | **Negative** filter: skip fixtures with any matching tag. No tags are excluded by default. |
+
+`--tags` and `--exclude-tags` apply independently — a fixture must match the
+positive filter (if any) **and** must not match the negative filter.
+
+## Reading the output
+
+Each fixture row prints averaged stage timings plus peak memory:
+
+```
+Fixture                      Total   Compile   Extract     Audio   Capture    Encode   PeakRSS  PeakHeap
+hdr-hlg-regression         11549ms     187ms     520ms      36ms    8373ms    2394ms    227MiB     69MiB
+hdr-regression             34452ms      94ms    1268ms      48ms   27034ms    5914ms    272MiB    118MiB
+```
+
+- **Total** — wall-clock time from job submission to mux-complete.
+- **Capture** — frame extraction + composition + alpha blit (HDR path).
+- **Encode** — chunked or streaming HDR encoder time (HEVC Main10 for HDR).
+- **PeakRSS / PeakHeap** — sampled every 250ms inside `executeRenderJob` from
+  `process.memoryUsage()`; surfaces gross memory regressions (e.g. unbounded
+  image-cache growth) that wall-clock numbers miss. RSS includes native
+  ffmpeg/Chrome allocations; heap is JS-side V8 only.
+
+## HDR baseline (April 2026)
+
+Captured on macOS arm64 (M-series), Bun runtime, 1 worker, default config,
+single run. These are illustrative — re-baseline locally before comparing your
+own runs.
+
+| Fixture | Total | Capture | Encode | PeakRSS | PeakHeap | Notes |
+| --- | ---: | ---: | ---: | ---: | ---: | --- |
+| `hdr-hlg-regression` | 11.5s | 8.4s (72%) | 2.4s (21%) | 227 MiB | 69 MiB | 150 frames, 2 HLG sources |
+| `hdr-regression` | 34.5s | 27.0s (78%) | 5.9s (17%) | 272 MiB | 118 MiB | 600 frames, 9 PQ sources, shader transition |
+
+Capture dominates HDR runs (~72-78%). The second-biggest cost is HEVC Main10
+encode. Memory peaks scale with source count and resolution — the PQ
+regression's nine HDR sources push heap from ~70 MiB → ~120 MiB.
+
+When evaluating an HDR optimization (image cache, gated debug logging, etc.)
+the metric to watch first is **Capture** ms-per-frame:
+
+```
+hdr-regression:      capture avg 45ms/frame
+hdr-hlg-regression:  capture avg 56ms/frame
+```
+
+## When to re-baseline
+
+- After landing any change that touches `renderOrchestrator.ts`,
+  `streamingEncoder.ts`, the HDR alpha-blit path, or `frameDirCache.ts`.
+- Before opening a PR that claims a perf win — paste before/after numbers in
+  the PR description.
+- Quarterly, even without code changes, to track infra/dependency drift.
+
+The `bench:hdr` script is the recommended command for routine HDR perf checks
+because it filters out non-HDR fixtures (which can be 10Ɨ slower without
+contributing signal to HDR-specific work).