Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/producer/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"perf:gate": "tsx src/perf-gate.ts",
"check:runtime-conformance": "tsx src/runtime-conformance.ts",
"benchmark": "tsx src/benchmark.ts",
"bench:hdr": "tsx src/benchmark.ts --tags hdr",
"test": "tsx src/regression-harness.ts",
"test:update": "tsx src/regression-harness.ts --update",
"docker:build:test": "docker build -f ../../Dockerfile.test -t hyperframes-producer:test ../..",
Expand Down
103 changes: 80 additions & 23 deletions packages/producer/src/benchmark.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,21 @@
/**
* Render Benchmark
*
* Runs each test fixture multiple times and records per-stage timing.
* Results are saved to producer/tests/perf/benchmark-results.json.
* Runs each test fixture multiple times and records per-stage timing
* plus peak heap/RSS memory. Results are saved to
* producer/tests/perf/benchmark-results.json.
*
* Usage:
* bun run benchmark # 3 runs per fixture (default)
* bun run benchmark -- --runs 5 # 5 runs per fixture
* bun run benchmark -- --only chat # single fixture
* bun run benchmark -- --exclude-tags slow
* bun run benchmark -- --tags hdr # only fixtures tagged "hdr"
* bun run bench:hdr # convenience: --tags hdr
*
* `--tags` and `--exclude-tags` may be passed together; a fixture must match
* at least one positive tag (when `--tags` is provided) AND must not match
* any excluded tag.
*/

import {
Expand Down Expand Up @@ -52,6 +59,10 @@ interface FixtureResult {
averages: {
totalElapsedMs: number;
captureAvgMs: number | null;
/** Average of per-run peak RSS in MiB. `null` if no run reported memory. */
peakRssMb: number | null;
/** Average of per-run peak heapUsed in MiB. `null` if no run reported memory. */
peakHeapUsedMb: number | null;
stages: Record<string, number>;
};
}
Expand All @@ -64,9 +75,19 @@ interface BenchmarkResults {
fixtures: FixtureResult[];
}

function parseArgs(): { runs: number; only: string | null; excludeTags: string[] } {
interface BenchmarkArgs {
  /** Number of runs per fixture; per-stage timings are averaged across them. Defaults to 3. */
  runs: number;
  /** When set, run only the fixture whose directory name matches exactly; `null` = run all. */
  only: string | null;
  /** Positive tag filter — fixture must include at least one. Empty = no positive filter. */
  tags: string[];
  /** Negative tag filter — fixture must not include any. Applied after `tags`. */
  excludeTags: string[];
}

function parseArgs(): BenchmarkArgs {
let runs = 3;
let only: string | null = null;
const tags: string[] = [];
const excludeTags: string[] = [];

for (let i = 2; i < process.argv.length; i++) {
Expand All @@ -76,17 +97,21 @@ function parseArgs(): { runs: number; only: string | null; excludeTags: string[]
} else if (process.argv[i] === "--only" && process.argv[i + 1]) {
i++;
only = process.argv[i] ?? null;
} else if (process.argv[i] === "--tags" && process.argv[i + 1]) {
i++;
tags.push(...(process.argv[i] ?? "").split(",").filter(Boolean));
} else if (process.argv[i] === "--exclude-tags" && process.argv[i + 1]) {
i++;
excludeTags.push(...(process.argv[i] ?? "").split(","));
excludeTags.push(...(process.argv[i] ?? "").split(",").filter(Boolean));
}
}

return { runs, only, excludeTags };
return { runs, only, tags, excludeTags };
}

function discoverFixtures(
only: string | null,
tags: string[],
excludeTags: string[],
): Array<{ id: string; dir: string; meta: TestMeta }> {
const fixtures: Array<{ id: string; dir: string; meta: TestMeta }> = [];
Expand All @@ -101,7 +126,11 @@ function discoverFixtures(
if (only && entry !== only) continue;

const meta: TestMeta = JSON.parse(readFileSync(metaPath, "utf-8"));
if (excludeTags.length > 0 && meta.tags?.some((t) => excludeTags.includes(t))) continue;
const fixtureTags = meta.tags ?? [];
// Positive filter (--tags): if provided, fixture must match at least one.
if (tags.length > 0 && !fixtureTags.some((t) => tags.includes(t))) continue;
// Negative filter (--exclude-tags): always wins.
if (excludeTags.length > 0 && fixtureTags.some((t) => excludeTags.includes(t))) continue;

fixtures.push({ id: entry, dir, meta });
}
Expand All @@ -114,16 +143,35 @@ function avg(nums: number[]): number {
return Math.round(nums.reduce((a, b) => a + b, 0) / nums.length);
}

/**
 * Average a list of optional numeric samples, skipping `null`/`undefined` gaps.
 *
 * Returns `null` (never `0`) when no defined samples exist, so the JSON output
 * keeps the `number | null` shape its consumers (perf README, regression
 * checks) expect — coercing missing memory data to `0` would silently mask
 * results regenerated from older harness versions that reported no memory.
 */
function avgOrNull(nums: Array<number | null | undefined>): number | null {
  let total = 0;
  let count = 0;
  for (const sample of nums) {
    if (typeof sample === "number") {
      total += sample;
      count += 1;
    }
  }
  // Same integer rounding as `avg()`: results are whole ms / MiB.
  return count > 0 ? Math.round(total / count) : null;
}

async function runBenchmark(): Promise<void> {
const { runs, only, excludeTags } = parseArgs();
const fixtures = discoverFixtures(only, excludeTags);
const { runs, only, tags, excludeTags } = parseArgs();
const fixtures = discoverFixtures(only, tags, excludeTags);

if (fixtures.length === 0) {
console.error("No fixtures found");
console.error(
`No fixtures found${tags.length ? ` matching tags=[${tags.join(",")}]` : ""}` +
`${excludeTags.length ? ` excluding=[${excludeTags.join(",")}]` : ""}`,
);
process.exit(1);
}

console.log(`\n🏁 Benchmark: ${fixtures.length} fixture(s) × ${runs} run(s)\n`);
const filterDesc =
(tags.length ? ` tags=[${tags.join(",")}]` : "") +
(excludeTags.length ? ` exclude=[${excludeTags.join(",")}]` : "");
console.log(`\n🏁 Benchmark: ${fixtures.length} fixture(s) × ${runs} run(s)${filterDesc}\n`);

const results: FixtureResult[] = [];

Expand Down Expand Up @@ -162,8 +210,12 @@ async function runBenchmark(): Promise<void> {
if (job.perfSummary) {
fixtureRuns.push({ run: r + 1, perfSummary: job.perfSummary });
const ps = job.perfSummary;
const memDesc =
ps.peakRssMb != null || ps.peakHeapUsedMb != null
? ` | peak RSS ${ps.peakRssMb ?? "?"}MiB heap ${ps.peakHeapUsedMb ?? "?"}MiB`
: "";
console.log(
` ✓ ${ps.totalElapsedMs}ms total | capture avg ${ps.captureAvgMs ?? "?"}ms/frame | ${ps.totalFrames} frames`,
` ✓ ${ps.totalElapsedMs}ms total | capture avg ${ps.captureAvgMs ?? "?"}ms/frame | ${ps.totalFrames} frames${memDesc}`,
);
}
}
Expand Down Expand Up @@ -192,19 +244,20 @@ async function runBenchmark(): Promise<void> {
runs: fixtureRuns,
averages: {
totalElapsedMs: avg(fixtureRuns.map((r) => r.perfSummary.totalElapsedMs)),
captureAvgMs:
avg(
fixtureRuns
.filter((r) => r.perfSummary.captureAvgMs != null)
.map((r) => r.perfSummary.captureAvgMs ?? 0),
) || null,
captureAvgMs: avgOrNull(fixtureRuns.map((r) => r.perfSummary.captureAvgMs)),
peakRssMb: avgOrNull(fixtureRuns.map((r) => r.perfSummary.peakRssMb)),
peakHeapUsedMb: avgOrNull(fixtureRuns.map((r) => r.perfSummary.peakHeapUsedMb)),
stages: avgStages,
},
};

results.push(fixtureResult);

console.log(`\n Average: ${fixtureResult.averages.totalElapsedMs}ms total`);
const memLine =
fixtureResult.averages.peakRssMb != null || fixtureResult.averages.peakHeapUsedMb != null
? ` | peak RSS ${fixtureResult.averages.peakRssMb ?? "?"}MiB heap ${fixtureResult.averages.peakHeapUsedMb ?? "?"}MiB`
: "";
console.log(`\n Average: ${fixtureResult.averages.totalElapsedMs}ms total${memLine}`);
for (const [stage, ms] of Object.entries(fixtureResult.averages.stages)) {
const pct = Math.round((ms / fixtureResult.averages.totalElapsedMs) * 100);
console.log(` ${stage}: ${ms}ms (${pct}%)`);
Expand All @@ -226,17 +279,19 @@ async function runBenchmark(): Promise<void> {

// Print summary table
console.log("\n\n📊 BENCHMARK SUMMARY");
console.log("═".repeat(80));
console.log("═".repeat(95));
console.log(
"Fixture".padEnd(25) +
"Total".padStart(10) +
"Compile".padStart(10) +
"Extract".padStart(10) +
"Audio".padStart(10) +
"Capture".padStart(10) +
"Encode".padStart(10),
"Encode".padStart(10) +
"PeakRSS".padStart(10) +
"PeakHeap".padStart(10),
);
console.log("─".repeat(80));
console.log("─".repeat(95));

for (const f of results) {
const s = f.averages.stages;
Expand All @@ -247,11 +302,13 @@ async function runBenchmark(): Promise<void> {
`${s.videoExtractMs ?? "-"}ms`.padStart(10) +
`${s.audioProcessMs ?? "-"}ms`.padStart(10) +
`${s.captureMs ?? "-"}ms`.padStart(10) +
`${s.encodeMs ?? "-"}ms`.padStart(10),
`${s.encodeMs ?? "-"}ms`.padStart(10) +
`${f.averages.peakRssMb ?? "-"}MiB`.padStart(10) +
`${f.averages.peakHeapUsedMb ?? "-"}MiB`.padStart(10),
);
}

console.log("═".repeat(80));
console.log("═".repeat(95));
console.log(`\nResults saved to: ${outputPath}`);
}

Expand Down
43 changes: 43 additions & 0 deletions packages/producer/src/services/renderOrchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,23 @@ export interface RenderPerfSummary {
stages: Record<string, number>;
captureAvgMs?: number;
capturePeakMs?: number;
/**
* Peak resident set size (RSS) observed during the render, in MiB.
*
* Sampled every 250ms by a process-wide poller; surfaces gross memory
* regressions (e.g. unbounded image-cache growth) that wall-clock numbers
* miss. Optional because callers can serialize older `RenderPerfSummary`
* shapes back into this type.
*/
peakRssMb?: number;
/**
* Peak V8 heap used observed during the render, in MiB.
*
* Useful as a finer-grained complement to {@link peakRssMb} — RSS includes
* native ffmpeg/Chrome allocations, while heapUsed isolates JS-object growth
* inside the orchestrator. Optional for the same back-compat reason.
*/
peakHeapUsedMb?: number;
hdrDiagnostics?: HdrDiagnostics;
}

Expand Down Expand Up @@ -935,6 +952,27 @@ export async function executeRenderJob(
const chunkedEncodeSize = cfg.chunkSizeFrames;
const enableStreamingEncode = cfg.enableStreamingEncode;

// Periodic memory sampler — surfaces peak RSS/heap so the benchmark harness
// can detect memory regressions (e.g. unbounded image-cache growth) that
// wall-clock numbers miss. Sampled every 250ms; the interval is `unref`'d so
// it never keeps the event loop alive on its own, and always cleared in the
// finally block below regardless of how the render exits.
let peakRssBytes = 0;
let peakHeapUsedBytes = 0;
const sampleMemory = (): void => {
try {
const m = process.memoryUsage();
if (m.rss > peakRssBytes) peakRssBytes = m.rss;
if (m.heapUsed > peakHeapUsedBytes) peakHeapUsedBytes = m.heapUsed;
} catch {
// Defensive: process.memoryUsage() shouldn't throw, but if it ever
// does we don't want to take down the render for a benchmark accessory.
}
};
sampleMemory();
const memSamplerInterval: NodeJS.Timeout = setInterval(sampleMemory, 250);
memSamplerInterval.unref?.();

try {
const assertNotAborted = () => {
if (abortSignal?.aborted) {
Expand Down Expand Up @@ -2484,6 +2522,7 @@ export async function executeRenderJob(
updateJobStatus(job, "complete", "Render complete", 100, onProgress);

const totalElapsed = Date.now() - pipelineStart;
sampleMemory();

const perfSummary: RenderPerfSummary = {
renderId: job.id,
Expand All @@ -2505,6 +2544,8 @@ export async function executeRenderJob(
: undefined,
captureAvgMs:
totalFrames > 0 ? Math.round((perfStages.captureMs ?? 0) / totalFrames) : undefined,
peakRssMb: Math.round(peakRssBytes / (1024 * 1024)),
peakHeapUsedMb: Math.round(peakHeapUsedBytes / (1024 * 1024)),
};
job.perfSummary = perfSummary;
if (job.config.debug) {
Expand Down Expand Up @@ -2638,5 +2679,7 @@ export async function executeRenderJob(

if (restoreLogger) restoreLogger();
throw error;
} finally {
clearInterval(memSamplerInterval);
}
}
100 changes: 100 additions & 0 deletions packages/producer/tests/perf/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Producer Performance Benchmarks

End-to-end render benchmark harness driven by `src/benchmark.ts`. Discovers
fixtures under `packages/producer/tests/<fixture>/` (any directory with a
`meta.json`), runs them through the full producer pipeline, and emits per-stage
timing plus peak memory metrics into `benchmark-results.json`.

The harness is deliberately lightweight — it doesn't enforce thresholds. It's
designed for **regression spotting**: capture a baseline, change something,
re-run, eyeball the diff. For pass/fail thresholds see `tests/perf/baseline.json`
and the perf-regression checks in the integration test suite.

## Quick start

```bash
# Run every fixture with default settings (3 runs each)
cd packages/producer
bun run benchmark

# HDR-only baseline (PQ + HLG fixtures, ~50s on M-series Macs)
bun run bench:hdr

# Average a fixture across multiple runs
bunx tsx src/benchmark.ts --tags hdr --runs 3

# Just the PQ regression
bunx tsx src/benchmark.ts --only hdr-regression

# Skip fixtures tagged "slow"
bunx tsx src/benchmark.ts --exclude-tags slow
```

Results are written to
`packages/producer/tests/perf/benchmark-results.json` and a summary table is
printed to stdout.

## CLI flags

| Flag | Description |
| --- | --- |
| `--runs N` | Run each fixture `N` times and average (default: 3). |
| `--only <id>` | Run a single fixture by directory name. |
| `--tags a,b` | **Positive** filter: only fixtures whose `meta.json#tags` contains *any* of the listed tags. |
| `--exclude-tags a,b` | **Negative** filter: skip fixtures with any matching tag. No tags are excluded unless the flag is passed. |

`--tags` and `--exclude-tags` apply independently — a fixture must match the
positive filter (if any) **and** must not match the negative filter.

## Reading the output

Each fixture row prints averaged stage timings plus peak memory:

```
Fixture Total Compile Extract Audio Capture Encode PeakRSS PeakHeap
hdr-hlg-regression 11549ms 187ms 520ms 36ms 8373ms 2394ms 227MiB 69MiB
hdr-regression 34452ms 94ms 1268ms 48ms 27034ms 5914ms 272MiB 118MiB
```

- **Total** — wall-clock time from job submission to mux-complete.
- **Capture** — frame extraction + composition + alpha blit (HDR path).
- **Encode** — chunked or streaming HDR encoder time (HEVC Main10 for HDR).
- **PeakRSS / PeakHeap** — sampled every 250ms inside `executeRenderJob` from
`process.memoryUsage()`; surfaces gross memory regressions (e.g. unbounded
image-cache growth) that wall-clock numbers miss. RSS includes native
ffmpeg/Chrome allocations; heap is JS-side V8 only.

## HDR baseline (April 2026)

Captured on macOS arm64 (M-series), Bun runtime, 1 worker, default config,
single run. These are illustrative — re-baseline locally before comparing your
own runs.

| Fixture | Total | Capture | Encode | PeakRSS | PeakHeap | Notes |
| --- | ---: | ---: | ---: | ---: | ---: | --- |
| `hdr-hlg-regression` | 11.5s | 8.4s (72%) | 2.4s (21%) | 227 MiB | 69 MiB | 150 frames, 2 HLG sources |
| `hdr-regression` | 34.5s | 27.0s (78%) | 5.9s (17%) | 272 MiB | 118 MiB | 600 frames, 9 PQ sources, shader transition |

Capture dominates HDR runs (~72-78%). The second-biggest cost is HEVC Main10
encode. Memory peaks scale with source count and resolution — the PQ
regression's nine HDR sources push heap from ~70 MiB → ~120 MiB.

When evaluating an HDR optimization (image cache, gated debug logging, etc.)
the metric to watch first is **Capture** ms-per-frame:

```
hdr-regression: capture avg 45ms/frame
hdr-hlg-regression: capture avg 56ms/frame
```

## When to re-baseline

- After landing any change that touches `renderOrchestrator.ts`,
`streamingEncoder.ts`, the HDR alpha-blit path, or `frameDirCache.ts`.
- Before opening a PR that claims a perf win — paste before/after numbers in
the PR description.
- Quarterly, even without code changes, to track infra/dependency drift.

The `bench:hdr` script is the recommended command for routine HDR perf checks
because it filters out non-HDR fixtures (which can be 10× slower without
contributing signal to HDR-specific work).
Loading