diff --git a/docs/docs.json b/docs/docs.json index 9e9b5129a..440ff046a 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -69,6 +69,7 @@ { "group": "Guides", "pages": [ + "guides/website-to-video", "guides/prompting", "guides/gsap-animation", "guides/rendering", diff --git a/docs/guides/website-to-video.mdx b/docs/guides/website-to-video.mdx new file mode 100644 index 000000000..10d3aa1c0 --- /dev/null +++ b/docs/guides/website-to-video.mdx @@ -0,0 +1,227 @@ +--- +title: Website to Video +description: "Capture any website and turn it into a production video with a single prompt." +--- + +Give your AI agent a URL and a creative direction. It captures the site, extracts the brand identity, writes a script and storyboard, generates voiceover, builds animated compositions, and delivers a renderable video. + +``` +"Create a 20-second product launch video from https://linear.app. + Make it feel like an Apple keynote announcement." +``` + +## Getting Started + + + + Skills teach your AI agent how to capture websites and create HyperFrames compositions. Install once — they persist across sessions. + + ```bash + npx skills add heygen-com/hyperframes + ``` + + Works with [Claude Code](https://claude.ai/claude-code), [Cursor](https://cursor.sh), [Gemini CLI](https://github.com/google-gemini/gemini-cli), and [Codex CLI](https://github.com/openai/codex). + + + Open your agent in any directory and describe the video you want: + + ``` + Create a 25-second product launch video from https://example.com. Bold, cinematic, dark theme energy. + ``` + + The agent loads the skill when it sees a URL and a video request, and runs the full pipeline — capture, design, script, storyboard, voiceover, build, validate. + + + Agents also trigger this skill automatically when they see a URL and a video request. + + + + ```bash + npx hyperframes preview + ``` + + Opens the video in your browser. Edits reload automatically.
+ + + ```bash + npx hyperframes render --output my-video.mp4 + ``` + + ``` + ✓ Captured 750 frames in 12.4s + ✓ Encoded to my-video.mp4 (25.0s, 1920×1080, 6.8MB) + ``` + + + + + You don't need to run `npx hyperframes capture` manually — the skill instructs the agent to capture as the first step. The capture command is documented [below](#capture-command) for advanced use. + + +## How the Pipeline Works + +The skill runs 7 steps. Each produces an artifact that feeds the next: + +| Step | Output | What happens | +|------|--------|-------------| +| **Capture** | `captures/<site>/` | Extract screenshots, design tokens, fonts, assets, animations | +| **Design** | `DESIGN.md` | Brand reference — colors, typography, do's and don'ts | +| **Script** | `SCRIPT.md` | Narration text with hook, story, proof, CTA | +| **Storyboard** | `STORYBOARD.md` | Per-beat creative direction — mood, assets, animations, transitions | +| **VO + Timing** | `narration.wav` + `transcript.json` | TTS audio with word-level timestamps | +| **Build** | `compositions/*.html` | Animated HTML compositions, one per beat | +| **Validate** | Snapshot PNGs | Visual verification before delivery | + +## Video Types + +The prompt determines the format. Include a duration and creative direction: + +| Type | Duration | Example | +|------|----------|---------| +| Social ad | 10–15s | _"15-second Instagram reel. Energetic, fast cuts."_ | +| Product launch | 20–30s | _"25-second product launch. Apple keynote energy."_ | +| Product tour | 30–60s | _"45-second tour showing the top 3 features."_ | +| Brand reel | 15–30s | _"20-second brand video. Celebrate the design."_ | +| Feature announcement | 15–25s | _"Feature announcement highlighting the new AI agents."_ | +| Teaser | 8–15s | _"10-second teaser. Super minimal. Just the hook."_ | + + + Creative direction matters more than format.
_"Playful, hand-crafted feel"_ or _"dark, developer-focused, show code"_ shapes the storyboard and drives every visual decision the agent makes. + + +## Enriching Captures with Gemini Vision + +By default, captures describe assets using DOM context — alt text, nearby headings, CSS classes. Add a [Gemini API key](https://aistudio.google.com/apikey) for richer AI-powered descriptions using vision. + +Create a `.env` file in your project root: + +```bash +echo "GEMINI_API_KEY=your-key-here" > .env +``` + + + + ``` + - hero-bg.png — 582KB, section: "Hero", above fold + ``` + The agent knows the file exists and where it was on the page, but not what it looks like. + + + ``` + - hero-bg.png — 582KB, A gradient wave in purple and blue sweeps + across a dark background, creating an aurora-like effect. + ``` + The agent knows what the image actually shows, enabling better creative decisions in the storyboard. + + + +| Tier | Rate limit | Cost per image | +|------|-----------|----------------| +| Free | 5 RPM | Free | +| Paid | 2,000 RPM | ~$0.001 | + +A typical capture with 40 images costs about **$0.04** on the paid tier. + +## Capture Command + +The skill runs capture automatically, but you can run it directly for pre-caching, debugging, or using the data outside of video production. 
+ +```bash +npx hyperframes capture https://stripe.com +``` + +``` +◇ Captured Stripe | Financial Infrastructure → captures/stripe-com + + Screenshots: 12 + Assets: 45 + Sections: 15 + Fonts: sohne-var +``` + +| Flag | Default | Description | +|------|---------|-------------| +| `-o, --output` | `captures/` | Output directory | +| `--timeout` | `120000` | Page load timeout in ms | +| `--skip-assets` | `false` | Skip downloading images and fonts | +| `--max-screenshots` | `24` | Maximum screenshot count | +| `--json` | `false` | Output structured JSON for programmatic use | + +### What Gets Captured + +| Data | Description | +|------|-------------| +| **Screenshots** | Viewport captures at every scroll depth — dynamic count based on page height | +| **Colors** | Pixel-sampled dominant colors + computed styles, including oklch/lab conversion | +| **Fonts** | CSS font families + downloaded woff2 files | +| **Assets** | Images, SVGs with semantic names, Lottie animations, video previews | +| **Text** | All visible text in DOM order | +| **Animations** | Web Animations API, scroll-triggered animations, WebGL shaders | +| **Sections** | Page structure with headings, types, background colors | +| **CTAs** | Buttons and links detected by class names and text patterns | + +## Snapshot Command + +Capture key frames from a built video as PNGs — verify compositions without a full render: + +```bash +npx hyperframes snapshot my-project --at 2.9,10.4,18.7 +``` + +| Flag | Default | Description | +|------|---------|-------------| +| `--frames` | `5` | Number of evenly-spaced frames | +| `--at` | — | Comma-separated timestamps in seconds | +| `--timeout` | `5000` | Ms to wait for runtime to initialize | + +## Iterating + +You don't need to re-run the full pipeline to make changes: + +- **Edit the storyboard** — `STORYBOARD.md` is the creative north star. Change a beat's mood or assets, then ask the agent to rebuild just that beat. 
+- **Edit a composition** — open `compositions/beat-3-proof.html` directly and tweak animations, colors, or layout. +- **Rebuild one beat** — _"Rebuild beat 2 with more energy. Use the product screenshot as full-bleed background."_ + +## Troubleshooting + + + + Increase the timeout for sites with Cloudflare or heavy client-side rendering: + + ```bash + npx hyperframes capture https://example.com --timeout 180000 + ``` + + + Sites using frameworks like Framer lazy-load images via IntersectionObserver. The capture scrolls through the page to trigger loading, but very long pages may miss images near the bottom. Adding a Gemini key improves descriptions of captured assets, but doesn't increase the count. + + + The capture uses pixel sampling combined with DOM computed styles. Dark sites should show dark colors in the palette. Check the scroll screenshots in `captures/<site>/screenshots/` to see what the capture actually saw. + + + Verify skills are installed: + + ```bash + npx skills add heygen-com/hyperframes + ``` + + Lead your prompt with _"Use the /website-to-hyperframes skill"_ for the most reliable results. Agents also discover it automatically when they see a URL and a video request. + + + +## Next Steps + + + + New to HyperFrames? Start here. + + + Animation patterns used in compositions. + + + Render to MP4, MOV, or WebM. + + + Full command reference.
+ + diff --git a/docs/packages/cli.mdx b/docs/packages/cli.mdx index 66682edcd..4af280e57 100644 --- a/docs/packages/cli.mdx +++ b/docs/packages/cli.mdx @@ -14,11 +14,13 @@ npx hyperframes ## When to Use **Use the CLI when you want to:** -- Create a new composition project from an example -- Preview compositions with live hot reload during development -- Render compositions to MP4 (locally or in Docker) -- Lint compositions for structural issues -- Check your environment for missing dependencies +- Capture a website for video production (`capture`) +- Create a new composition project from an example (`init`) +- Preview compositions with live hot reload (`preview`) +- Render compositions to MP4 locally or in Docker (`render`) +- Lint compositions for structural issues (`lint`) +- Capture key frames as PNG screenshots (`snapshot`) +- Check your environment for missing dependencies (`doctor`) **Use a different package if you want to:** - Render programmatically from Node.js code — use the [producer](/packages/producer) @@ -321,6 +323,39 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_ Combine `tts` with `transcribe` to generate narration and word-level timestamps for captions in a single workflow: generate the audio with `tts`, then transcribe the output with `transcribe` to get word-level timing. 
+ ### `capture` + + Capture a website — extract screenshots, design tokens, fonts, assets, and animations for video production: + + ```bash + npx hyperframes capture https://stripe.com + npx hyperframes capture https://linear.app -o captures/linear + npx hyperframes capture https://example.com --json + ``` + + ``` + ◇ Captured Stripe | Financial Infrastructure → captures/stripe-com + + Screenshots: 12 + Assets: 45 + Sections: 15 + Fonts: sohne-var + ``` + + | Flag | Description | + |------|-------------| + | `-o, --output` | Output directory (default: `captures/`) | + | `--timeout` | Page load timeout in ms (default: 120000) | + | `--skip-assets` | Skip downloading images and fonts | + | `--max-screenshots` | Maximum screenshot count (default: 24) | + | `--json` | Output structured JSON for programmatic use | + + The capture command extracts everything an AI agent needs to understand a website's visual identity: viewport screenshots at every scroll depth, color palette (pixel-sampled + DOM computed), font files, images with semantic names, SVGs, Lottie animations, video previews, WebGL shaders, visible text, and page structure. + + Output is a self-contained directory with a `CLAUDE.md` file that any AI agent can read to understand the captured site. Used by the `/website-to-hyperframes` skill as step 1 of the video production pipeline. + + Set `GEMINI_API_KEY` in a `.env` file for AI-powered image descriptions via Gemini vision (~$0.001/image). See the [Website to Video](/guides/website-to-video#enriching-captures-with-gemini-vision) guide for details. + ### `preview` @@ -375,6 +410,32 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_ - **Info** (`ℹ`) — informational notices, shown only with `--verbose` The linter detects missing attributes, missing adapter libraries (GSAP, Lottie, Three.js), structural problems, and more. See [Common Mistakes](/guides/common-mistakes) for details on each rule. 
+ + ### `snapshot` + + Capture key frames from a composition as PNG screenshots — verify visual output without a full render: + + ```bash + npx hyperframes snapshot my-project --at 2.9,10.4,18.7 + npx hyperframes snapshot my-project --frames 10 + ``` + + ``` + ◆ Capturing 3 frames at [2.9s, 10.4s, 18.7s] from my-project + + ◇ 3 snapshots saved to snapshots/ + snapshots/frame-00-at-2.9s.png + snapshots/frame-01-at-10.4s.png + snapshots/frame-02-at-18.7s.png + ``` + + | Flag | Description | + |------|-------------| + | `--frames` | Number of evenly-spaced frames to capture (default: 5) | + | `--at` | Comma-separated timestamps in seconds (e.g., `3.0,10.5,18.0`) | + | `--timeout` | Ms to wait for runtime to initialize (default: 5000) | + + The snapshot command bundles the project, serves it locally, launches headless Chrome, seeks to each timestamp, and captures a 1920×1080 PNG. Useful for visual verification during the build step of the [website-to-video](/guides/website-to-video) workflow. ### `render` diff --git a/packages/cli/src/capture/contentExtractor.ts b/packages/cli/src/capture/contentExtractor.ts index 028586b59..f376944f3 100644 --- a/packages/cli/src/capture/contentExtractor.ts +++ b/packages/cli/src/capture/contentExtractor.ts @@ -174,7 +174,11 @@ export async function captionImagesWithGemini( // Free tier: 5 RPM → batch 5, 12s pause (~$0 but slow) // Paid tier: 2000 RPM → batch 20, 1s pause (~$0.001/image, fast) // We try a larger batch first; if rate-limited, fall back to smaller batches. - const model = "gemini-2.5-flash"; + // Default is a preview model — update when GA ships. + // Benchmark (49 images, paid tier): 3.1-flash-lite-preview ~507ms/img 131ch avg, + // 2.5-flash-lite ~230ms/img 117ch avg. Preview has richer captions but higher variance. 
+ // Override: HYPERFRAMES_GEMINI_MODEL=gemini-2.5-flash-lite + const model = process.env.HYPERFRAMES_GEMINI_MODEL || "gemini-3.1-flash-lite-preview"; const BATCH_SIZE = 20; for (let i = 0; i < imageFiles.length; i += BATCH_SIZE) { const batch = imageFiles.slice(i, i + BATCH_SIZE); @@ -210,7 +214,7 @@ export async function captionImagesWithGemini( geminiCaptions[result.value.file] = result.value.caption; } } - // Pace requests to stay under free tier rate limits (5 RPM for gemini-2.5-flash) + // Pace requests between batches (paid tier: 2000+ RPM, free tier: rate-limited) if (i + BATCH_SIZE < imageFiles.length) { await new Promise((r) => setTimeout(r, 2000)); // 2s pause between batches — paid tier handles 2000 RPM, free tier retries via Promise.allSettled } diff --git a/packages/cli/src/capture/scaffolding.ts b/packages/cli/src/capture/scaffolding.ts index 97dd2d693..e0db0fea8 100644 --- a/packages/cli/src/capture/scaffolding.ts +++ b/packages/cli/src/capture/scaffolding.ts @@ -61,51 +61,13 @@ export async function generateProjectScaffold( progress: (stage: string, detail?: string) => void, warnings: string[], ): Promise { - // Ensure capture output is a valid HyperFrames project (index.html + meta.json) - const indexPath = join(outputDir, "index.html"); + // Capture output is a DATA folder, not a video project. + // The agent builds index.html + compositions/ during step 6. + // We only write meta.json (project metadata) — NOT index.html. + // Writing index.html here caused a double-audio bug: the runtime + // discovered both the scaffold and the agent's real index.html as + // valid compositions, playing two audio tracks offset in time. const metaPath = join(outputDir, "meta.json"); - if (!existsSync(indexPath)) { - writeFileSync( - indexPath, - ` - - - - - - - - - -
- - -
-
-
-
- - - - - - -
- - - - -`, - "utf-8", - ); - } if (!existsSync(metaPath)) { const hostname = new URL(url).hostname.replace(/^www\./, ""); writeFileSync( diff --git a/packages/cli/src/utils/lintProject.test.ts b/packages/cli/src/utils/lintProject.test.ts index c3e0b6acd..c3773945d 100644 --- a/packages/cli/src/utils/lintProject.test.ts +++ b/packages/cli/src/utils/lintProject.test.ts @@ -326,6 +326,146 @@ describe("audio_src_not_found", () => { }); }); +describe("multiple_root_compositions", () => { + it("fires when two HTML files have data-composition-id", () => { + const project = makeProject(validHtml()); + writeFileSync( + join(project.dir, "scaffold.html"), + '
', + ); + const { totalErrors, results } = lintProject(project); + const finding = results[0]?.result.findings.find( + (f) => f.code === "multiple_root_compositions", + ); + expect(finding).toBeDefined(); + expect(finding?.severity).toBe("error"); + expect(finding?.message).toContain("scaffold.html"); + expect(totalErrors).toBeGreaterThan(0); + }); + + it("does NOT fire with a single root composition", () => { + const project = makeProject(validHtml()); + const { results } = lintProject(project); + const finding = results[0]?.result.findings.find( + (f) => f.code === "multiple_root_compositions", + ); + expect(finding).toBeUndefined(); + }); + + it("ignores HTML files without data-composition-id", () => { + const project = makeProject(validHtml()); + writeFileSync(join(project.dir, "readme.html"), "Not a composition"); + const { results } = lintProject(project); + const finding = results[0]?.result.findings.find( + (f) => f.code === "multiple_root_compositions", + ); + expect(finding).toBeUndefined(); + }); +}); + +describe("duplicate_audio_track", () => { + it("detects overlapping audio with attributes in any order", () => { + // The original scaffold bug: data-start BEFORE data-track-index + const html = ` +
+
+ +`; + const project = makeProject(html); + const { results } = lintProject(project); + const finding = results[0]?.result.findings.find((f) => f.code === "duplicate_audio_track"); + expect(finding).toBeDefined(); + expect(finding?.severity).toBe("warning"); + }); + + it("does NOT fire for non-overlapping audio on the same track", () => { + const html = ` +
+
+ +`; + const project = makeProject(html); + const { results } = lintProject(project); + const finding = results[0]?.result.findings.find((f) => f.code === "duplicate_audio_track"); + expect(finding).toBeUndefined(); + }); + + it("does NOT fire for audio on different tracks", () => { + const html = ` +
+
+ +`; + const project = makeProject(html); + const { results } = lintProject(project); + const finding = results[0]?.result.findings.find((f) => f.code === "duplicate_audio_track"); + expect(finding).toBeUndefined(); + }); + + it("deduplicates same audio found in root + sub-composition", () => { + const project = makeProject(validHtmlWithAudio(), { + "scene.html": validHtmlWithAudio("scene"), + }); + writeFileSync(join(project.dir, "song.mp3"), "fake"); + const { results } = lintProject(project); + const finding = results[0]?.result.findings.find((f) => f.code === "duplicate_audio_track"); + expect(finding).toBeUndefined(); + }); + + it("detects overlap when data-duration is missing (Infinity fallback)", () => { + const html = ` +
+
+ +`; + const project = makeProject(html); + const { results } = lintProject(project); + const finding = results[0]?.result.findings.find((f) => f.code === "duplicate_audio_track"); + expect(finding).toBeDefined(); + }); + + it("formats Infinity end times as 'end' without crashing", () => { + const html = ` +
+
+ +`; + const project = makeProject(html); + const { results } = lintProject(project); + const finding = results[0]?.result.findings.find((f) => f.code === "duplicate_audio_track"); + expect(finding).toBeDefined(); + expect(finding?.message).toContain("end"); + expect(finding?.message).not.toContain("Infinity"); + }); + + it("finds audio across multiple HTML sources (g-flag regression)", () => { + const project = makeProject(validHtmlWithAudio(), { + "scene.html": ` +
+
+ +`, + }); + writeFileSync(join(project.dir, "song.mp3"), "fake"); + writeFileSync(join(project.dir, "music.wav"), "fake"); + const { results } = lintProject(project); + const finding = results[0]?.result.findings.find((f) => f.code === "duplicate_audio_track"); + // song.mp3@0 (from validHtmlWithAudio, no data-duration → Infinity) and music.wav@5-25 overlap + expect(finding).toBeDefined(); + }); +}); + describe("shouldBlockRender", () => { it("default: does not block on errors", () => { expect(shouldBlockRender(false, false, 5, 0)).toBe(false); diff --git a/packages/cli/src/utils/lintProject.ts b/packages/cli/src/utils/lintProject.ts index cb0354632..3519baad3 100644 --- a/packages/cli/src/utils/lintProject.ts +++ b/packages/cli/src/utils/lintProject.ts @@ -53,6 +53,8 @@ export function lintProject(project: ProjectDir): ProjectLintResult { const projectFindings = [ ...lintProjectAudioFiles(project.dir, allHtmlSources), ...lintAudioSrcNotFound(project.dir, allHtmlSources), + ...lintMultipleRootCompositions(project.dir), + ...lintDuplicateAudioTracks(allHtmlSources), ]; if (projectFindings.length > 0) { // Append project-level findings to the root index.html result @@ -107,7 +109,7 @@ function lintProjectAudioFiles(projectDir: string, htmlSources: string[]): Hyper fixHint: 'Add an element inside the composition root.', + '" data-start="0" data-duration="__DURATION__" data-track-index="0" data-volume="1"> element inside the composition root. Replace __DURATION__ with the audio length in seconds.', }); } @@ -154,6 +156,98 @@ function lintAudioSrcNotFound(projectDir: string, htmlSources: string[]): Hyperf return findings; } +/** + * Error if multiple root-level HTML files with data-composition-id exist. + * Scans the project directory filesystem (not just what lintProject chose to read) + * to catch stray scaffold files, duplicates, or backup copies. 
+ */ +function lintMultipleRootCompositions(projectDir: string): HyperframeLintFinding[] { + const findings: HyperframeLintFinding[] = []; + try { + const rootHtmlFiles = readdirSync(projectDir).filter((f) => f.endsWith(".html")); + const rootCompositions: string[] = []; + for (const file of rootHtmlFiles) { + const content = readFileSync(join(projectDir, file), "utf-8"); + if (/data-composition-id/i.test(content)) { + rootCompositions.push(file); + } + } + if (rootCompositions.length > 1) { + findings.push({ + code: "multiple_root_compositions", + severity: "error", + message: `Multiple root-level HTML files with data-composition-id: ${rootCompositions.join(", ")}. The runtime may discover both as entry points, causing duplicate audio playback.`, + fixHint: + "A project should have exactly one root index.html with data-composition-id. Remove or rename extra files.", + }); + } + } catch { + /* directory read failed — skip */ + } + return findings; +} + +/** + * Warn if multiple