From 6802e3bc9dba27a0a09eee5759d1e02d1b7408bb Mon Sep 17 00:00:00 2001 From: James Date: Wed, 22 Apr 2026 21:30:09 +0000 Subject: [PATCH 1/2] ci(regression): build test Docker image once, share across shards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Splits regression.yml into a `build-image` job + the existing `regression-shards` matrix. The build job produces a Docker tarball via `docker/build-push-action` with `outputs: type=docker,dest=...`, uploads it as a GHA artifact (retention 1 day, gzip level 1), and each shard downloads + `docker load`s it instead of rebuilding. Measured on PR #419 regression runs before the change: - Docker build step: ~234s per shard WITH GHA layer cache hit - 11 shards × ~234s = ~43 min of runner time per PR just on redundant image builds Cold-cache cases are much worse — happening right now on PR #419 after release commit b6f50ce bumped every `packages/*/package.json`, invalidating the COPY layer that feeds `bun install --frozen-lockfile`. All 10 shards are currently 25-30+ min into a parallel rebuild, thundering-herding the same npm packages from 10 runners. After this change: - 1× build (~4 min warm, ~15 min cold) + 11× (download + `docker load`) - Expected ~15-20s overhead per shard for artifact download + load - Net savings: ~30-40 min of runner time per PR run on warm cache, substantially more on cold cache The build job doesn't check out LFS — Dockerfile.test only COPYs source + package manifests, never the golden baselines, so the image build never needed LFS. Shards still need LFS for the tests/**/output/output.mp4 baselines they validate against. 
--- .github/workflows/regression.yml | 64 ++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 73871aa2..9acee6e7 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -30,10 +30,52 @@ jobs: - "packages/engine/**" - "Dockerfile*" - regression-shards: + # Build the regression Docker image once, export it as a tarball, and upload + # as an artifact. Each matrix shard then downloads + `docker load`s it instead + # of rebuilding from cache. Measured on PR #419: the Docker build step takes + # ~4 min per shard even with GHA cache, so 11 shards = ~44 min of redundant + # build time per run. This job replaces that with a single ~4 min build plus + # ~15s of artifact download per shard. + build-image: + name: Build regression test image needs: changes if: needs.changes.outputs.code == 'true' runs-on: ubuntu-latest + timeout-minutes: 20 + steps: + - name: Checkout + uses: actions/checkout@v4 + # No LFS needed here — Dockerfile.test only copies source + package manifests, + # not the golden baselines under packages/producer/tests/**/output. + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build test image to tarball + uses: docker/build-push-action@v6 + with: + context: . 
+ file: Dockerfile.test + tags: hyperframes-producer:test + cache-from: type=gha,scope=regression-test-image + cache-to: type=gha,mode=max,scope=regression-test-image + outputs: type=docker,dest=/tmp/regression-test-image.tar + + - name: Report image size + run: ls -lh /tmp/regression-test-image.tar + + - name: Upload image artifact + uses: actions/upload-artifact@v4 + with: + name: regression-test-image + path: /tmp/regression-test-image.tar + retention-days: 1 + compression-level: 1 + + regression-shards: + needs: [changes, build-image] + if: needs.changes.outputs.code == 'true' + runs-on: ubuntu-latest timeout-minutes: 40 strategy: fail-fast: false @@ -79,18 +121,16 @@ jobs: fi done - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Build test Docker image (cached) - uses: docker/build-push-action@v6 + - name: Download test image artifact + uses: actions/download-artifact@v4 with: - context: . - file: Dockerfile.test - load: true - tags: hyperframes-producer:test - cache-from: type=gha,scope=regression-test-image - cache-to: type=gha,mode=max,scope=regression-test-image + name: regression-test-image + path: /tmp + + - name: Load test image + run: | + docker load -i /tmp/regression-test-image.tar + docker image ls hyperframes-producer:test - name: "Run regression shard: ${{ matrix.shard }}" run: | From e09775cc0191004868bfd7e0a3f56fc399e75d0a Mon Sep 17 00:00:00 2001 From: James Date: Wed, 22 Apr 2026 21:34:57 +0000 Subject: [PATCH 2/2] ci(regression): add explicit least-privilege permissions Addresses CodeQL warning 'Workflow does not contain permissions'. Defaults the workflow GITHUB_TOKEN to `contents: read` only. The build-image job elevates to `actions: write` because `docker/build-push-action` with `cache-from/to: type=gha` uses the GitHub Actions cache API, which needs read+write on the actions scope. 
--- .github/workflows/regression.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 9acee6e7..c8dd04eb 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -11,6 +11,12 @@ concurrency: group: regression-${{ github.ref }} cancel-in-progress: true +# Least-privilege token: only reading code. Jobs that need more (e.g. GHA +# cache reads/writes from docker/build-push-action with `type=gha`) elevate +# their own permissions inline. +permissions: + contents: read + jobs: changes: name: Detect changes @@ -42,6 +48,9 @@ jobs: if: needs.changes.outputs.code == 'true' runs-on: ubuntu-latest timeout-minutes: 20 + permissions: + contents: read + actions: write # docker/build-push-action `type=gha` cache reads + writes steps: - name: Checkout uses: actions/checkout@v4