diff --git a/packages/producer/package.json b/packages/producer/package.json index 9f8bb44d2..bf7333b47 100644 --- a/packages/producer/package.json +++ b/packages/producer/package.json @@ -37,6 +37,7 @@ "perf:gate": "tsx src/perf-gate.ts", "check:runtime-conformance": "tsx src/runtime-conformance.ts", "benchmark": "tsx src/benchmark.ts", + "bench:hdr": "tsx src/benchmark.ts --tags hdr", "test": "tsx src/regression-harness.ts", "test:update": "tsx src/regression-harness.ts --update", "docker:build:test": "docker build -f ../../Dockerfile.test -t hyperframes-producer:test ../..", diff --git a/packages/producer/src/benchmark.ts b/packages/producer/src/benchmark.ts index fb3357f47..68d843eff 100644 --- a/packages/producer/src/benchmark.ts +++ b/packages/producer/src/benchmark.ts @@ -2,14 +2,21 @@ /** * Render Benchmark * - * Runs each test fixture multiple times and records per-stage timing. - * Results are saved to producer/tests/perf/benchmark-results.json. + * Runs each test fixture multiple times and records per-stage timing + * plus peak heap/RSS memory. Results are saved to + * producer/tests/perf/benchmark-results.json. * * Usage: * bun run benchmark # 3 runs per fixture (default) * bun run benchmark -- --runs 5 # 5 runs per fixture * bun run benchmark -- --only chat # single fixture * bun run benchmark -- --exclude-tags slow + * bun run benchmark -- --tags hdr # only fixtures tagged "hdr" + * bun run bench:hdr # convenience: --tags hdr + * + * `--tags` and `--exclude-tags` may be passed together; a fixture must match + * at least one positive tag (when `--tags` is provided) AND must not match + * any excluded tag. */ import { @@ -52,6 +59,10 @@ interface FixtureResult { averages: { totalElapsedMs: number; captureAvgMs: number | null; + /** Average of per-run peak RSS in MiB. `null` if no run reported memory. */ + peakRssMb: number | null; + /** Average of per-run peak heapUsed in MiB. `null` if no run reported memory. 
*/ + peakHeapUsedMb: number | null; stages: Record; }; } @@ -64,9 +75,19 @@ interface BenchmarkResults { fixtures: FixtureResult[]; } -function parseArgs(): { runs: number; only: string | null; excludeTags: string[] } { +interface BenchmarkArgs { + runs: number; + only: string | null; + /** Positive tag filter — fixture must include at least one. Empty = no positive filter. */ + tags: string[]; + /** Negative tag filter — fixture must not include any. Applied after `tags`. */ + excludeTags: string[]; +} + +function parseArgs(): BenchmarkArgs { let runs = 3; let only: string | null = null; + const tags: string[] = []; const excludeTags: string[] = []; for (let i = 2; i < process.argv.length; i++) { @@ -76,17 +97,21 @@ function parseArgs(): { runs: number; only: string | null; excludeTags: string[] } else if (process.argv[i] === "--only" && process.argv[i + 1]) { i++; only = process.argv[i] ?? null; + } else if (process.argv[i] === "--tags" && process.argv[i + 1]) { + i++; + tags.push(...(process.argv[i] ?? "").split(",").filter(Boolean)); } else if (process.argv[i] === "--exclude-tags" && process.argv[i + 1]) { i++; - excludeTags.push(...(process.argv[i] ?? "").split(",")); + excludeTags.push(...(process.argv[i] ?? "").split(",").filter(Boolean)); } } - return { runs, only, excludeTags }; + return { runs, only, tags, excludeTags }; } function discoverFixtures( only: string | null, + tags: string[], excludeTags: string[], ): Array<{ id: string; dir: string; meta: TestMeta }> { const fixtures: Array<{ id: string; dir: string; meta: TestMeta }> = []; @@ -101,7 +126,11 @@ function discoverFixtures( if (only && entry !== only) continue; const meta: TestMeta = JSON.parse(readFileSync(metaPath, "utf-8")); - if (excludeTags.length > 0 && meta.tags?.some((t) => excludeTags.includes(t))) continue; + const fixtureTags = meta.tags ?? []; + // Positive filter (--tags): if provided, fixture must match at least one. 
+ if (tags.length > 0 && !fixtureTags.some((t) => tags.includes(t))) continue; + // Negative filter (--exclude-tags): always wins. + if (excludeTags.length > 0 && fixtureTags.some((t) => excludeTags.includes(t))) continue; fixtures.push({ id: entry, dir, meta }); } @@ -114,16 +143,35 @@ function avg(nums: number[]): number { return Math.round(nums.reduce((a, b) => a + b, 0) / nums.length); } +/** + * Average a possibly-empty list of optional numbers. Returns `null` when no + * defined samples exist so the JSON output stays consistent with the + * `peakRssMb: number | null` shape the consumer (perf README, regression + * checks) expects — silently coercing missing memory data to `0` would mask + * older results regenerated against this harness. + */ +function avgOrNull(nums: Array): number | null { + const filtered = nums.filter((n): n is number => typeof n === "number"); + if (filtered.length === 0) return null; + return avg(filtered); +} + async function runBenchmark(): Promise { - const { runs, only, excludeTags } = parseArgs(); - const fixtures = discoverFixtures(only, excludeTags); + const { runs, only, tags, excludeTags } = parseArgs(); + const fixtures = discoverFixtures(only, tags, excludeTags); if (fixtures.length === 0) { - console.error("No fixtures found"); + console.error( + `No fixtures found${tags.length ? ` matching tags=[${tags.join(",")}]` : ""}` + + `${excludeTags.length ? ` excluding=[${excludeTags.join(",")}]` : ""}`, + ); process.exit(1); } - console.log(`\nšŸ Benchmark: ${fixtures.length} fixture(s) Ɨ ${runs} run(s)\n`); + const filterDesc = + (tags.length ? ` tags=[${tags.join(",")}]` : "") + + (excludeTags.length ? 
` exclude=[${excludeTags.join(",")}]` : ""); + console.log(`\nšŸ Benchmark: ${fixtures.length} fixture(s) Ɨ ${runs} run(s)${filterDesc}\n`); const results: FixtureResult[] = []; @@ -162,8 +210,12 @@ async function runBenchmark(): Promise { if (job.perfSummary) { fixtureRuns.push({ run: r + 1, perfSummary: job.perfSummary }); const ps = job.perfSummary; + const memDesc = + ps.peakRssMb != null || ps.peakHeapUsedMb != null + ? ` | peak RSS ${ps.peakRssMb ?? "?"}MiB heap ${ps.peakHeapUsedMb ?? "?"}MiB` + : ""; console.log( - ` āœ“ ${ps.totalElapsedMs}ms total | capture avg ${ps.captureAvgMs ?? "?"}ms/frame | ${ps.totalFrames} frames`, + ` āœ“ ${ps.totalElapsedMs}ms total | capture avg ${ps.captureAvgMs ?? "?"}ms/frame | ${ps.totalFrames} frames${memDesc}`, ); } } @@ -192,19 +244,20 @@ async function runBenchmark(): Promise { runs: fixtureRuns, averages: { totalElapsedMs: avg(fixtureRuns.map((r) => r.perfSummary.totalElapsedMs)), - captureAvgMs: - avg( - fixtureRuns - .filter((r) => r.perfSummary.captureAvgMs != null) - .map((r) => r.perfSummary.captureAvgMs ?? 0), - ) || null, + captureAvgMs: avgOrNull(fixtureRuns.map((r) => r.perfSummary.captureAvgMs)), + peakRssMb: avgOrNull(fixtureRuns.map((r) => r.perfSummary.peakRssMb)), + peakHeapUsedMb: avgOrNull(fixtureRuns.map((r) => r.perfSummary.peakHeapUsedMb)), stages: avgStages, }, }; results.push(fixtureResult); - console.log(`\n Average: ${fixtureResult.averages.totalElapsedMs}ms total`); + const memLine = + fixtureResult.averages.peakRssMb != null || fixtureResult.averages.peakHeapUsedMb != null + ? ` | peak RSS ${fixtureResult.averages.peakRssMb ?? "?"}MiB heap ${fixtureResult.averages.peakHeapUsedMb ?? 
"?"}MiB` + : ""; + console.log(`\n Average: ${fixtureResult.averages.totalElapsedMs}ms total${memLine}`); for (const [stage, ms] of Object.entries(fixtureResult.averages.stages)) { const pct = Math.round((ms / fixtureResult.averages.totalElapsedMs) * 100); console.log(` ${stage}: ${ms}ms (${pct}%)`); @@ -226,7 +279,7 @@ async function runBenchmark(): Promise { // Print summary table console.log("\n\nšŸ“Š BENCHMARK SUMMARY"); - console.log("═".repeat(80)); + console.log("═".repeat(95)); console.log( "Fixture".padEnd(25) + "Total".padStart(10) + @@ -234,9 +287,11 @@ async function runBenchmark(): Promise { "Extract".padStart(10) + "Audio".padStart(10) + "Capture".padStart(10) + - "Encode".padStart(10), + "Encode".padStart(10) + + "PeakRSS".padStart(10) + + "PeakHeap".padStart(10), ); - console.log("─".repeat(80)); + console.log("─".repeat(95)); for (const f of results) { const s = f.averages.stages; @@ -247,11 +302,13 @@ async function runBenchmark(): Promise { `${s.videoExtractMs ?? "-"}ms`.padStart(10) + `${s.audioProcessMs ?? "-"}ms`.padStart(10) + `${s.captureMs ?? "-"}ms`.padStart(10) + - `${s.encodeMs ?? "-"}ms`.padStart(10), + `${s.encodeMs ?? "-"}ms`.padStart(10) + + `${f.averages.peakRssMb ?? "-"}MiB`.padStart(10) + + `${f.averages.peakHeapUsedMb ?? "-"}MiB`.padStart(10), ); } - console.log("═".repeat(80)); + console.log("═".repeat(95)); console.log(`\nResults saved to: ${outputPath}`); } diff --git a/packages/producer/src/services/renderOrchestrator.ts b/packages/producer/src/services/renderOrchestrator.ts index 54fcea821..308bb85fd 100644 --- a/packages/producer/src/services/renderOrchestrator.ts +++ b/packages/producer/src/services/renderOrchestrator.ts @@ -239,6 +239,23 @@ export interface RenderPerfSummary { stages: Record; captureAvgMs?: number; capturePeakMs?: number; + /** + * Peak resident set size (RSS) observed during the render, in MiB. + * + * Sampled every 250ms by a process-wide poller; surfaces gross memory + * regressions (e.g. 
unbounded image-cache growth) that wall-clock numbers + * miss. Optional because callers can serialize older `RenderPerfSummary` + * shapes back into this type. + */ + peakRssMb?: number; + /** + * Peak V8 heap used observed during the render, in MiB. + * + * Useful as a finer-grained complement to {@link peakRssMb} — RSS includes + * native ffmpeg/Chrome allocations, while heapUsed isolates JS-object growth + * inside the orchestrator. Optional for the same back-compat reason. + */ + peakHeapUsedMb?: number; hdrDiagnostics?: HdrDiagnostics; } @@ -963,6 +980,27 @@ export async function executeRenderJob( const chunkedEncodeSize = cfg.chunkSizeFrames; const enableStreamingEncode = cfg.enableStreamingEncode; + // Periodic memory sampler — surfaces peak RSS/heap so the benchmark harness + // can detect memory regressions (e.g. unbounded image-cache growth) that + // wall-clock numbers miss. Sampled every 250ms; the interval is `unref`'d so + // it never keeps the event loop alive on its own, and always cleared in the + // finally block below regardless of how the render exits. + let peakRssBytes = 0; + let peakHeapUsedBytes = 0; + const sampleMemory = (): void => { + try { + const m = process.memoryUsage(); + if (m.rss > peakRssBytes) peakRssBytes = m.rss; + if (m.heapUsed > peakHeapUsedBytes) peakHeapUsedBytes = m.heapUsed; + } catch { + // Defensive: process.memoryUsage() shouldn't throw, but if it ever + // does we don't want to take down the render for a benchmark accessory. 
+ } + }; + sampleMemory(); + const memSamplerInterval: NodeJS.Timeout = setInterval(sampleMemory, 250); + memSamplerInterval.unref?.(); + try { const assertNotAborted = () => { if (abortSignal?.aborted) { @@ -2512,6 +2550,7 @@ export async function executeRenderJob( updateJobStatus(job, "complete", "Render complete", 100, onProgress); const totalElapsed = Date.now() - pipelineStart; + sampleMemory(); const perfSummary: RenderPerfSummary = { renderId: job.id, @@ -2533,6 +2572,8 @@ export async function executeRenderJob( : undefined, captureAvgMs: totalFrames > 0 ? Math.round((perfStages.captureMs ?? 0) / totalFrames) : undefined, + peakRssMb: Math.round(peakRssBytes / (1024 * 1024)), + peakHeapUsedMb: Math.round(peakHeapUsedBytes / (1024 * 1024)), }; job.perfSummary = perfSummary; if (job.config.debug) { @@ -2666,5 +2707,7 @@ export async function executeRenderJob( if (restoreLogger) restoreLogger(); throw error; + } finally { + clearInterval(memSamplerInterval); } } diff --git a/packages/producer/tests/perf/README.md b/packages/producer/tests/perf/README.md new file mode 100644 index 000000000..543fe840d --- /dev/null +++ b/packages/producer/tests/perf/README.md @@ -0,0 +1,100 @@ +# Producer Performance Benchmarks + +End-to-end render benchmark harness driven by `src/benchmark.ts`. Discovers +fixtures under `packages/producer/tests//` (any directory with a +`meta.json`), runs them through the full producer pipeline, and emits per-stage +timing plus peak memory metrics into `benchmark-results.json`. + +The harness is deliberately lightweight — it doesn't enforce thresholds. It's +designed for **regression spotting**: capture a baseline, change something, +re-run, eyeball the diff. For pass/fail thresholds see `tests/perf/baseline.json` +and the perf-regression checks in the integration test suite. 
+
+## Quick start
+
+```bash
+# Run every fixture (3 runs each by default)
+cd packages/producer
+bun run benchmark
+
+# HDR-only baseline (PQ + HLG fixtures, ~50s on M-series Macs)
+bun run bench:hdr
+
+# Average a fixture across multiple runs
+bunx tsx src/benchmark.ts --tags hdr --runs 3
+
+# Just the PQ regression
+bunx tsx src/benchmark.ts --only hdr-regression
+
+# Skip fixtures tagged "slow"
+bunx tsx src/benchmark.ts --exclude-tags slow
+```
+
+Results are written to
+`packages/producer/tests/perf/benchmark-results.json` and a summary table is
+printed to stdout.
+
+## CLI flags
+
+| Flag | Description |
+| --- | --- |
+| `--runs N` | Run each fixture `N` times and average (default: 3). |
+| `--only <fixture>` | Run a single fixture by directory name. |
+| `--tags a,b` | **Positive** filter: only fixtures whose `meta.json#tags` contains *any* of the listed tags. |
+| `--exclude-tags a,b` | **Negative** filter: skip fixtures with any matching tag. No tags are excluded by default. |
+
+`--tags` and `--exclude-tags` apply independently — a fixture must match the
+positive filter (if any) **and** must not match the negative filter.
+
+## Reading the output
+
+Each fixture row prints averaged stage timings plus peak memory:
+
+```
+Fixture                      Total   Compile   Extract     Audio   Capture    Encode   PeakRSS  PeakHeap
+hdr-hlg-regression         11549ms     187ms     520ms      36ms    8373ms    2394ms    227MiB     69MiB
+hdr-regression             34452ms      94ms    1268ms      48ms   27034ms    5914ms    272MiB    118MiB
+```
+
+- **Total** — wall-clock time from job submission to mux-complete.
+- **Capture** — frame extraction + composition + alpha blit (HDR path).
+- **Encode** — chunked or streaming HDR encoder time (HEVC Main10 for HDR).
+- **PeakRSS / PeakHeap** — sampled every 250ms inside `executeRenderJob` from
+  `process.memoryUsage()`; surfaces gross memory regressions (e.g. unbounded
+  image-cache growth) that wall-clock numbers miss. RSS includes native
+  ffmpeg/Chrome allocations; heap is JS-side V8 only.
+
+## HDR baseline (April 2026)
+
+Captured on macOS arm64 (M-series), Bun runtime, 1 worker, default config,
+single run. These are illustrative — re-baseline locally before comparing your
+own runs.
+
+| Fixture | Total | Capture | Encode | PeakRSS | PeakHeap | Notes |
+| --- | ---: | ---: | ---: | ---: | ---: | --- |
+| `hdr-hlg-regression` | 11.5s | 8.4s (72%) | 2.4s (21%) | 227 MiB | 69 MiB | 150 frames, 2 HLG sources |
+| `hdr-regression` | 34.5s | 27.0s (78%) | 5.9s (17%) | 272 MiB | 118 MiB | 600 frames, 9 PQ sources, shader transition |
+
+Capture dominates HDR runs (~72-78%). The second-biggest cost is HEVC Main10
+encode. Memory peaks scale with source count and resolution — the PQ
+regression's nine HDR sources push heap from ~70 MiB → ~120 MiB.
+
+When evaluating an HDR optimization (image cache, gated debug logging, etc.)
+the metric to watch first is **Capture** ms-per-frame:
+
+```
+hdr-regression:      capture avg 45ms/frame
+hdr-hlg-regression:  capture avg 56ms/frame
+```
+
+## When to re-baseline
+
+- After landing any change that touches `renderOrchestrator.ts`,
+  `streamingEncoder.ts`, the HDR alpha-blit path, or `frameDirCache.ts`.
+- Before opening a PR that claims a perf win — paste before/after numbers in
+  the PR description.
+- Quarterly, even without code changes, to track infra/dependency drift.
+
+The `bench:hdr` script is the recommended command for routine HDR perf checks
+because it filters out non-HDR fixtures (which can be 10Ɨ slower without
+contributing signal to HDR-specific work).