From 9b7531b9086731400a5b14d5a4c1ab594d1fddda Mon Sep 17 00:00:00 2001
From: ukimsanov <ular.kimsanov@heygen.com>
Date: Sun, 14 Jun 2026 00:39:59 -0700
Subject: [PATCH 1/2] fix(capture/lint/producer): pipeline robustness fixes
 from real-AI-test runs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Surgical bugfixes accumulated across a series of real-AI-test runs
(heygen.com, huly.io, heygen-showcase). Each fix targets a specific
observed defect; happy paths are untouched.

packages/cli/src/capture/

  • assetCataloger.ts: surface three structural logo signals on every
    cataloged asset (inBanner / inHomeLink / matchesTitleBrand). The
    prior class-substring-only isLogo detector caught 0/32 SVGs on
    heygen.com and 0/19 on huly.io — modern React/Tailwind builds
    don't put "logo" or "brand" in any className. The new signals
    catch the universal "site header logo" pattern. Boolean merge
    semantics: any positive sample wins through context-merge +
    srcset dedup.

  • tokenExtractor.ts: broaden inline-SVG isLogo via the same three
    structural signals (header/nav/role=banner ancestor, root-href
    anchor parent, document.title brand-segment match in aria-label).
    The existing class-substring detector is unchanged and still runs
    first; the new heuristics only fire when it misses.

  • assetDownloader.ts: content-hash SVG slugs. SVG filenames are now
    `svg-<8char-sha1>.svg` (or `logo-<hash>.svg` when isLogo flags
    fire), replacing the previous label-derived slugging that
    mis-attributed brand carousels. Verified by rasterizing real
    captured SVGs: heygen-logo.svg actually contained the Google
    wordmark, hubspot-logo.svg contained Trivago, huly-logo.svg
    contained "Kube", heygen-logo.svg → "oogo". Catalog → URL label
    inference (aria-label / nearest-heading / sectionClasses) is too
    drift-prone across partner-logo carousels; content-hash names are
    invariant by construction.

  • contentExtractor.ts: SVG→PNG rasterization via sharp before
    sending to Gemini Vision. Previous path sent raw SVG markup as
    text and hit pure-hallucination output on wordmarks (VIVIENNE
    for HubSpot, "wrestling" for Workday). Vision models can read
    PNG pixels reliably; they cannot mental-render path commands.
    Adds fill-polarity detection (light-glyph vs dark-glyph) so the
    flatten background is chosen to contrast with the dominant fill:
    dark slate behind light glyphs, white behind dark glyphs. This
    keeps an SVG from flattening to a blank PNG before captioning.

  • contentExtractor.ts: LOGO tag in asset-descriptions.md lines
    when the structural signals fire (independent of Gemini). The
    no-Gemini-key fallback still emits an ⚠ banner + the LOGO-tagged
    lines so agents can grep for logos via filename pattern even
    without Vision.

  • index.ts: asset-descriptions.md header branches on Gemini-key
    presence with an explicit "Vision was OFF, descriptions are
    catalog-derived" warning + a fallback recipe ("open LOGO-tagged
    SVGs in a previewer before referencing"). Progress message also
    reports catalog-fallback mode.

  • capture/assetCataloger.ts + capture/tokenExtractor.ts regex
    escape: `/^https?:\\/\\/[^/]+\\/?$/` inside the page.evaluate
    template literal. The original `/^https?:\/\/[^/]+\/?$/` was
    collapsing `\/` to `/` inside the template (because backslash
    before a non-escape char is consumed), producing a parse error
    on every capture. Capture against heygen.com and huly.io both
    100% blocked on this until the escape was fixed.

packages/core/src/lint/utils.ts

  • findRootTag masks <!-- ... -->, <style>...</style>, and
    <script>...</script> ranges before tag extraction. A literal
    <video> token inside a CSS comment (`/* The card uses <video>
    as the surface */`) inside a <style> block was being picked as
    the composition root, producing two cascading false errors
    (root_missing_composition_id + root_missing_dimensions).
    Verified against a synthetic repro plus the real beat that hit
    this. Existing stripJsComments / extractScriptTextsAndSrcs
    exports preserved — earlier work-in-progress commits had
    accidentally removed them, but they are still consumed by
    lint/rules/{adapters,composition,core,gsap}.ts.

packages/cli/src/utils/lintProject.ts

  • New lintMissingLocalAsset rule: scans <video>/<img>/<source>
    src attributes for local files that don't exist in the project.
    Uses resolveExistingLocalAsset (same helper stylesheet lint
    uses) so the existence check matches the bundler's notion of
    "resolves" — handles root-absolute "/assets/foo.png" relative
    to projectDir, rejects "../outside.png" that escapes the
    project. The renderer otherwise 404s these silently and ships
    a video with missing visuals. Empirically the most common
    sub-agent mistake across multi-URL runs (~5+ per run).

packages/producer/build.mjs

  • ESM banner shims __dirname / __filename from import.meta.url
    alongside the existing createRequire shim. Bundled CJS deps —
    notably the ffmpeg/Emscripten wasm glue, which does
    `scriptDirectory = __dirname + "/"` — were crashing at render
    time with "__dirname is not defined in ES module scope".
    Verified by re-running a producer render after rebuild;
    ffmpeg pipeline completes cleanly.

All targeted tests pass (255/255 across capture / lint / lintProject
/ sfx unit tests). Typecheck clean for @hyperframes/core and
@hyperframes/cli. No happy-path behavior change; each fix targets
a specific observed failure.
---
 packages/cli/src/capture/assetCataloger.ts   |  41 +++-
 packages/cli/src/capture/assetDownloader.ts  |  60 +++++-
 packages/cli/src/capture/contentExtractor.ts | 102 ++++++++--
 packages/cli/src/capture/index.ts            |  13 +-
 packages/cli/src/capture/tokenExtractor.ts   |  39 +++-
 packages/cli/src/utils/lintProject.test.ts   | 192 +++++++++++++++++++
 packages/cli/src/utils/lintProject.ts        | 118 ++++++++++++
 packages/core/src/lint/utils.ts              |  32 +++-
 packages/producer/build.mjs                  |  15 +-
 9 files changed, 581 insertions(+), 31 deletions(-)
diff --git a/packages/cli/src/capture/assetCataloger.ts b/packages/cli/src/capture/assetCataloger.ts
index a9b3d3631..11306bee5 100644
--- a/packages/cli/src/capture/assetCataloger.ts
+++ b/packages/cli/src/capture/assetCataloger.ts
@@ -25,6 +25,12 @@ export interface CatalogedAsset {
   sectionClasses?: string;
   /** Whether the image is above the fold (visible without scrolling) */
   aboveFold?: boolean;
+  /** Element sits inside <header>, <nav>, or [role="banner"] — logo signal */
+  inBanner?: boolean;
+  /** Element sits inside <a> with site-root href ("/", "#", "./", origin-only) — brand-home link */
+  inHomeLink?: boolean;
+  /** alt/aria-label/title contains the brand segment of document.title */
+  matchesTitleBrand?: boolean;
 }
 
 /**
@@ -62,6 +68,29 @@ export async function catalogAssets(page: Page): Promise<CatalogedAsset[]> {
         var rect = el.getBoundingClientRect();
         ctx.aboveFold = rect.top < window.innerHeight;
       } catch(e) {}
+      // Logo signals — surfaced explicitly so the downloader can prefix
+      // logo-<hash> reliably. Real-AI-test on heygen.com + huly.io showed
+      // the prior class-substring detector caught 0 logos; these explicit
+      // structural signals catch the header logo across modern React/
+      // Tailwind builds where "logo" isn't in any className.
+      // 1. inBanner: element sits inside <header>, <nav>, or [role=banner].
+      ctx.inBanner = el.closest('header, nav, [role="banner"]') !== null;
+      // 2. inHomeLink: element sits inside an <a> whose href is the site
+      //    root ("/", "#", "./" or origin-only URL) — the brand-home link.
+      var homeAnchor = el.closest('a[href]');
+      if (homeAnchor) {
+        var aHref = homeAnchor.getAttribute('href') || '';
+        ctx.inHomeLink = aHref === '/' || aHref === '#' || aHref === './' ||
+                         /^https?:\\/\\/[^/]+\\/?$/.test(aHref);
+      }
+      // 3. matchesTitleBrand: alt/aria-label/title contains the brand
+      //    segment of the page title (everything before " - " / " | " /
+      //    " — ") — the "alt=HeyGen" / "aria-label=Huly" pattern.
+      var titleBrand = (document.title || '').split(/[-|—]/)[0].trim();
+      if (desc && titleBrand.length > 1 && titleBrand.length < 30 &&
+          desc.toLowerCase().indexOf(titleBrand.toLowerCase()) !== -1) {
+        ctx.matchesTitleBrand = true;
+      }
       return ctx;
     }
 
@@ -92,12 +121,18 @@ export async function catalogAssets(page: Page): Promise<CatalogedAsset[]> {
       if (notes && !entry.notes) {
         entry.notes = notes;
       }
-      // Merge rich context (first one wins)
+      // Merge rich context. Text fields: first-occurrence wins. Boolean
+      // signals (inBanner / inHomeLink / matchesTitleBrand): any positive
+      // sample wins — if ANY DOM occurrence of this URL is in a header,
+      // the URL is a header-context asset.
       if (richCtx) {
         if (richCtx.description && !entry.description) entry.description = richCtx.description;
         if (richCtx.nearestHeading && !entry.nearestHeading) entry.nearestHeading = richCtx.nearestHeading;
         if (richCtx.sectionClasses && !entry.sectionClasses) entry.sectionClasses = richCtx.sectionClasses;
         if (richCtx.aboveFold !== undefined && entry.aboveFold === undefined) entry.aboveFold = richCtx.aboveFold;
+        if (richCtx.inBanner) entry.inBanner = true;
+        if (richCtx.inHomeLink) entry.inHomeLink = true;
+        if (richCtx.matchesTitleBrand) entry.matchesTitleBrand = true;
       }
     }
 
@@ -324,6 +359,10 @@ function deduplicateSrcsetVariants(assets: CatalogedAsset[]): CatalogedAsset[] {
       if (a.notes && !existing.notes) {
         existing.notes = a.notes;
       }
+      // Boolean logo signals: any positive sample wins through the merge.
+      if (a.inBanner) existing.inBanner = true;
+      if (a.inHomeLink) existing.inHomeLink = true;
+      if (a.matchesTitleBrand) existing.matchesTitleBrand = true;
       // Keep the URL with highest w= value (largest image)
       const existingW = getWidthParam(existing.url);
       const newW = getWidthParam(a.url);
diff --git a/packages/cli/src/capture/assetDownloader.ts b/packages/cli/src/capture/assetDownloader.ts
index 5a5b5afd9..ce462611a 100644
--- a/packages/cli/src/capture/assetDownloader.ts
+++ b/packages/cli/src/capture/assetDownloader.ts
@@ -7,9 +7,22 @@
 
 import { writeFileSync, mkdirSync } from "node:fs";
 import { join, extname } from "node:path";
+import { createHash } from "node:crypto";
 import type { DesignTokens, DownloadedAsset } from "./types.js";
 import type { CatalogedAsset } from "./assetCataloger.js";
 
+/**
+ * Content-hash slug for SVGs — `svg-<8-char-sha1>` for icons / `logo-<hash>`
+ * when DOM evidence says it's a logo. Replaces label-derived slugging which
+ * mis-assigned brand names to the wrong SVG bodies (e.g. `heygen-logo.svg`
+ * landing on the Google partner-logo SVG). The hash is a function of the
+ * bytes, so the filename can never mismatch the content.
+ */
+function svgContentHashSlug(svgSource: string | Buffer, isLogo: boolean): string {
+  const hash = createHash("sha1").update(svgSource).digest("hex").slice(0, 8);
+  return isLogo ? `logo-${hash}` : `svg-${hash}`;
+}
+
 export async function downloadAssets(
   tokens: DesignTokens,
   outputDir: string,
@@ -22,15 +35,20 @@ export async function downloadAssets(
   const assets: DownloadedAsset[] = [];
   const downloadedUrls = new Set<string>();
 
-  // 1. ALL inline SVGs — save as files (logos get priority naming)
+  // 1. ALL inline SVGs — save as files. Names are content-hash based
+  //    (`svg-<hash>.svg` or `logo-<hash>.svg`) so the filename can never
+  //    drift from the SVG body. The DOM-derived `label` is unreliable —
+  //    it has misassigned `heygen-logo.svg` to the Google partner SVG in
+  //    past captures because aria-label / nearest-heading inference can
+  //    pick up text from the wrong ancestor. Content-hash is invariant.
   mkdirSync(join(outputDir, "assets", "svgs"), { recursive: true });
   const usedSvgNames = new Set<string>();
   for (let i = 0; i < tokens.svgs.length && i < 30; i++) {
     const svg = tokens.svgs[i]!;
     if (!svg.outerHTML || svg.outerHTML.length < 50) continue;
-    const label = svg.label?.replace(/[^a-zA-Z0-9-_ ]/g, "").trim();
-    let slug = label ? slugify(label) : svg.isLogo ? `logo-${i}` : `icon-${i}`;
-    // Deduplicate — two SVGs with same aria-label get suffixed
+    const slug = svgContentHashSlug(svg.outerHTML, !!svg.isLogo);
+    // Distinct SVGs colliding on an 8-char sha1 prefix is negligible at <30
+    // SVGs, but byte-identical SVGs hash the same — suffix-dedupe the name.
     let finalSlug = slug;
     let suffix = 2;
     while (usedSvgNames.has(finalSlug)) {
@@ -135,8 +153,38 @@ export async function downloadAssets(
       if (result.status !== "fulfilled" || !result.value) continue;
       const { url, isPoster, parsedUrl, ext, buffer, catalog } = result.value;
       try {
-        // Generate human-readable name from catalog context
-        const slug = deriveAssetName(parsedUrl, catalog, isPoster, imgIdx, usedNames);
+        // SVGs use content-hash names because catalog-derived slugs
+        // mis-assigned brand names to the wrong SVG bodies (the same
+        // alignment failure that produced `heygen-logo.svg` containing
+        // the Google wordmark). Rasters keep the catalog-derived
+        // human-readable slug — they were not affected by the bug.
+        let slug: string;
+        if (ext === ".svg") {
+          // isLogo signals — broadened. The original `contexts` substring
+          // check never fired in practice because contexts hold HTML
+          // positions like 'img[src]' / 'video[poster]', not semantic
+          // labels. Real signals come from DOM structure + alt/aria text:
+          // 1. The cataloger now flags inBanner (inside <header>/<nav>/
+          //    [role=banner]), inHomeLink (inside <a href="/">), and
+          //    matchesTitleBrand (alt/aria matches document.title's
+          //    brand segment) — see assetCataloger.ts getElementContext.
+          // 2. As a backstop, also check description / nearestHeading /
+          //    sectionClasses for "logo" / "brand" / "wordmark" text.
+          const c = catalog;
+          const brandRe = /logo|brand|wordmark/i;
+          const isLogo = !!(
+            c?.inBanner ||
+            c?.inHomeLink ||
+            c?.matchesTitleBrand ||
+            c?.contexts?.some((s) => brandRe.test(s)) ||
+            (c?.description && brandRe.test(c.description)) ||
+            (c?.nearestHeading && brandRe.test(c.nearestHeading)) ||
+            (c?.sectionClasses && brandRe.test(c.sectionClasses))
+          );
+          slug = svgContentHashSlug(buffer, isLogo);
+        } else {
+          slug = deriveAssetName(parsedUrl, catalog, isPoster, imgIdx, usedNames);
+        }
         const name = `${slug}${ext}`;
         usedNames.add(slug);
         const localPath = `assets/${name}`;
diff --git a/packages/cli/src/capture/contentExtractor.ts b/packages/cli/src/capture/contentExtractor.ts
index edb69cc52..05d982d53 100644
--- a/packages/cli/src/capture/contentExtractor.ts
+++ b/packages/cli/src/capture/contentExtractor.ts
@@ -11,6 +11,7 @@
 import type { Page } from "puppeteer-core";
 import { existsSync, readdirSync, statSync, readFileSync } from "node:fs";
 import { join } from "node:path";
+import sharp from "sharp";
 import type { CatalogedAsset } from "./assetCataloger.js";
 import type { DesignTokens } from "./types.js";
 
@@ -232,7 +233,12 @@ export async function captionImagesWithGemini(
     }
     progress("design", `${Object.keys(geminiCaptions).length} images captioned with Gemini`);
 
-    // Caption SVGs by sending source code as text (vision API rejects image/svg+xml).
+    // Caption SVGs by RENDERING each to PNG via sharp first, then sending the
+    // PNG bytes to the Vision API — same call shape as raster images.
+    // Previous implementation sent SVG path markup as TEXT, which produced
+    // pure hallucinations on wordmarks (`hubspot-logo.svg` → "VIVIENNE",
+    // `huly-logo.svg` → "Kube", `workday.svg` → "wrestling"). Vision models
+    // can't reliably mental-render path commands; they need actual pixels.
     const svgFiles: Array<{ file: string; relPath: string }> = [];
     const assetsDir = join(outputDir, "assets");
     for (const f of readdirSync(assetsDir)) {
@@ -246,17 +252,49 @@ export async function captionImagesWithGemini(
     }
 
     if (svgFiles.length > 0) {
-      progress("design", `Captioning ${svgFiles.length} SVGs via code analysis...`);
+      progress("design", `Rasterizing + captioning ${svgFiles.length} SVGs via vision API...`);
       const SVG_BATCH = 20;
-      const MAX_SVG_CHARS = 10_000;
+      const SVG_RENDER_SIZE = 256; // px — enough resolution for Gemini to read wordmarks, small enough to keep payload sub-MB
       for (let i = 0; i < svgFiles.length; i += SVG_BATCH) {
         const batch = svgFiles.slice(i, i + SVG_BATCH);
         const results = await Promise.allSettled(
           batch.map(async ({ relPath }) => {
             const filePath = join(assetsDir, relPath);
-            let svgText = readFileSync(filePath, "utf-8");
-            if (svgText.length > MAX_SVG_CHARS) {
-              svgText = svgText.slice(0, MAX_SVG_CHARS) + "\n<!-- truncated -->";
+            let pngBase64: string;
+            try {
+              // Detect SVG fill polarity so we can pick a contrasting flatten
+              // background. White-glyph SVGs (huly's "✕ huly" wordmark uses
+              // fill="#fff") render invisible against white; dark-glyph SVGs
+              // render invisible against black. Choosing the background by
+              // dominant fill keeps both polarities readable for the vision API.
+              const svgSource = readFileSync(filePath, "utf-8");
+              const lightFillHits = (
+                svgSource.match(/fill\s*=\s*["'](#fff(fff)?|white|#f[ef][ef])["']/gi) || []
+              ).length;
+              const darkFillHits = (
+                svgSource.match(/fill\s*=\s*["'](#000(000)?|black|#[0-3]{6}|#[0-3]{3})["']/gi) || []
+              ).length;
+              const bg =
+                lightFillHits > darkFillHits
+                  ? { r: 32, g: 32, b: 32 } // dark slate behind light glyphs
+                  : { r: 255, g: 255, b: 255 }; // white behind dark glyphs (default)
+              // sharp rasterizes SVG → PNG natively.
+              const pngBuffer = await sharp(filePath)
+                .resize({
+                  width: SVG_RENDER_SIZE,
+                  height: SVG_RENDER_SIZE,
+                  fit: "inside",
+                  withoutEnlargement: false,
+                })
+                .flatten({ background: bg })
+                .png()
+                .toBuffer();
+              pngBase64 = pngBuffer.toString("base64");
+            } catch {
+              // SVG rasterization can fail on exotic features (external fonts,
+              // foreignObject, filters with missing primitives). Skip caption
+              // rather than block — agent will fall back to contact-sheet view.
+              return { file: relPath, caption: "" };
             }
             const response = await ai.models.generateContent({
               model,
@@ -264,12 +302,13 @@ export async function captionImagesWithGemini(
                 {
                   role: "user",
                   parts: [
+                    { inlineData: { mimeType: "image/png", data: pngBase64 } },
                     {
                       text:
-                        "This SVG code is from a website. Describe what it renders in ONE short sentence " +
-                        "for a video storyboard. Focus on: what shape/icon/illustration it is, its colors. " +
-                        "Be factual.\n\n" +
-                        svgText,
+                        "Describe this SVG asset rendered from a website in ONE short sentence for a video storyboard. " +
+                        "Focus on: what shape/icon/illustration/wordmark it is, its colors, any text it contains. " +
+                        "If you see a wordmark, READ THE LETTERS LITERALLY — do not guess a brand from context. " +
+                        "Be factual.",
                     },
                   ],
                 },
@@ -334,13 +373,28 @@ export function generateAssetDescriptions(
       const heading = catalogMatch?.nearestHeading || "";
       const section = catalogMatch?.sectionClasses || "";
       const aboveFold = catalogMatch?.aboveFold ? "above fold" : "";
+      // Logo signals — let the no-Gemini fallback still surface logos
+      // grep-ably even when Vision wasn't available to describe them.
+      const isLikelyLogo = !!(
+        catalogMatch?.inBanner ||
+        catalogMatch?.inHomeLink ||
+        catalogMatch?.matchesTitleBrand ||
+        /logo|brand|wordmark/i.test(desc) ||
+        /logo|brand|wordmark/i.test(section) ||
+        file.includes("logo")
+      );
       const geminiCaption = geminiCaptions[file];
       const cleanName = file.replace(/\.[^.]+$/, "").replace(/[-_]/g, " ");
       const parts = [`${file} — ${sizeKb}KB`];
       if (geminiCaption) {
+        // Even with Gemini's description, prepend the LOGO tag if
+        // structural signals fired — gives a stable grep target for
+        // agents searching for "the logo."
+        if (isLikelyLogo) parts.push("LOGO");
         parts.push(geminiCaption);
         captionedLines.push(parts.join(", "));
       } else {
+        if (isLikelyLogo) parts.push("LOGO");
         if (desc) parts.push(`"${desc.slice(0, 80)}"`);
         if (heading) parts.push(`section: "${heading.slice(0, 60)}"`);
         else if (section) parts.push(`in: ${section.split(" ").slice(0, 3).join(" ")}`);
@@ -358,11 +412,6 @@ export function generateAssetDescriptions(
     const svgsPath = join(assetsPath, "svgs");
     for (const file of readdirSync(svgsPath)) {
       if (!file.endsWith(".svg")) continue;
-      const geminiCaption = geminiCaptions[`svgs/${file}`];
-      if (geminiCaption) {
-        svgLines.push(`svgs/${file} — ${geminiCaption}`);
-        continue;
-      }
       const svgMatch = tokens.svgs.find(
         (s) =>
           s.label &&
@@ -373,9 +422,28 @@ export function generateAssetDescriptions(
               .slice(0, 15),
           ),
       );
+      // Filename prefix is now the most reliable logo signal: the
+      // capture pipeline names DOM-marked logos `logo-<hash>.svg` and
+      // everything else `svg-<hash>.svg`. Fall back to the tokens.svgs
+      // isLogo flag for legacy captures + a filename-includes-"logo"
+      // check for legacy label-derived SVG names (e.g. `heygen-logo.svg`).
+      //
+      // Compute this BEFORE the Gemini-caption branch so SVG logos that
+      // got Vision captions still receive the LOGO marker — without it
+      // an inline header `<svg>` named `logo-<hash>.svg` would land in
+      // asset-descriptions.md as plain text, defeating the LOGO grep.
+      const isLogo = file.startsWith("logo-") || svgMatch?.isLogo || file.includes("logo");
+      const geminiCaption = geminiCaptions[`svgs/${file}`];
+      if (geminiCaption) {
+        const prefix = isLogo ? "LOGO: " : "";
+        svgLines.push(`svgs/${file} — ${prefix}${geminiCaption}`);
+        continue;
+      }
       const label = svgMatch?.label || file.replace(".svg", "").replace(/-/g, " ");
-      const isLogo = svgMatch?.isLogo || file.includes("logo");
-      svgLines.push(`svgs/${file} — ${isLogo ? "logo: " : "icon: "}${label}`);
+      // Use uppercase "LOGO:" so agents can grep for it as a single,
+      // unambiguous token. The lowercase "logo:" prefix was easy to miss
+      // since real Vision captions also use the word casually.
+      svgLines.push(`svgs/${file} — ${isLogo ? "LOGO: " : "icon: "}${label}`);
     }
   } catch {
     /* no svgs dir */
diff --git a/packages/cli/src/capture/index.ts b/packages/cli/src/capture/index.ts
index 1903abbcc..9af063f07 100644
--- a/packages/cli/src/capture/index.ts
+++ b/packages/cli/src/capture/index.ts
@@ -579,14 +579,19 @@ export async function captureWebsite(
       const lines = generateAssetDescriptions(outputDir, tokens, catalogedAssets, geminiCaptions);
 
       if (lines.length > 0) {
+        const hasGeminiKey = !!(process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY);
+        const header = hasGeminiKey
+          ? "# Asset Descriptions\n\nOne line per file. Read this instead of opening every image individually. Lines tagged `LOGO` are the brand-mark candidates — search for the brand name here BEFORE composing a logo from scratch.\n\n"
+          : "# Asset Descriptions\n\n⚠️  GEMINI_API_KEY not set — descriptions below are catalog-derived (alt text, headings, section context, filename) instead of Vision-generated. Lines tagged `LOGO` are the brand-mark candidates per DOM-structural signals (inside header/nav, inside home anchor, or alt-text matching the page title). To get richer Vision descriptions on the next capture, set GEMINI_API_KEY (or GOOGLE_API_KEY) and re-run.\n\nWhen the description is too weak to identify a captured logo by description alone, open the LOGO-tagged SVGs in a previewer or `sharp`-render them to PNG before referencing — the alternative (composing a fake logo) ships off-brand in the final video.\n\n";
         writeFileSync(
           join(outputDir, "extracted", "asset-descriptions.md"),
-          "# Asset Descriptions\n\nOne line per file. Read this instead of opening every image individually.\n\n" +
-            lines.map((l) => "- " + l).join("\n") +
-            "\n",
+          header + lines.map((l) => "- " + l).join("\n") + "\n",
           "utf-8",
         );
-        progress("design", `${lines.length} asset descriptions written`);
+        progress(
+          "design",
+          `${lines.length} asset descriptions written${hasGeminiKey ? "" : " (no Gemini key — catalog-fallback mode)"}`,
+        );
       }
     } catch {
       /* non-critical */
diff --git a/packages/cli/src/capture/tokenExtractor.ts b/packages/cli/src/capture/tokenExtractor.ts
index 662585386..a23830756 100644
--- a/packages/cli/src/capture/tokenExtractor.ts
+++ b/packages/cli/src/capture/tokenExtractor.ts
@@ -353,6 +353,43 @@ const EXTRACT_SCRIPT = `(() => {
     // Keep SVGs that have a label OR are at least 16px wide OR are inside a logo/brand context
     var inLogoContext = svg.closest('[class*="logo"], [class*="brand"], [class*="partner"], [class*="customer"], [class*="marquee"]') !== null;
     if (!label && !inLogoContext && (!w || parseInt(w) < 16)) return null;
+    // isLogo — broadened detection. Real-AI-test on heygen.com + huly.io
+    // showed the original class-based detector caught 0/32 and 0/19 SVGs
+    // respectively — modern React/Tailwind builds don't put "logo" / "brand"
+    // anywhere in className. Three additional cheap heuristics added below.
+    var isLogo = (label && label.toLowerCase().indexOf("logo") !== -1) ||
+                 svg.closest('[class*="logo"], [class*="brand"], [class*="home"], [class*="marquee"], [class*="partner"], [class*="customer"]') !== null;
+    if (!isLogo) {
+      // (a) First SVG inside <header>, <nav>, or [role=banner] — the
+      // canonical "home logo at the top of the page" pattern.
+      var bannerEl = svg.closest('header, nav, [role="banner"]');
+      if (bannerEl) {
+        var firstSvg = bannerEl.querySelector('svg');
+        if (firstSvg === svg) isLogo = true;
+      }
+    }
+    if (!isLogo) {
+      // (b) SVG inside an <a> whose href is the site root — the brand-home
+      // link pattern (e.g. <a href="/"><svg>...</svg></a>).
+      var anchor = svg.closest('a[href]');
+      if (anchor) {
+        var href = anchor.getAttribute('href') || '';
+        if (href === '/' || href === '#' || href === './' ||
+            /^https?:\\/\\/[^/]+\\/?$/.test(href)) {
+          isLogo = true;
+        }
+      }
+    }
+    if (!isLogo) {
+      // (c) SVG with aria-label that matches the document title's brand
+      // segment (everything before " - " / " | " / " — ").
+      var ariaLabel = svg.getAttribute('aria-label') || svg.getAttribute('title') || '';
+      var titleBrand = (document.title || '').split(/[-|—]/)[0].trim();
+      if (titleBrand.length > 1 && titleBrand.length < 30 &&
+          ariaLabel.toLowerCase().indexOf(titleBrand.toLowerCase()) !== -1) {
+        isLogo = true;
+      }
+    }
     var rect = svg.getBoundingClientRect();
     return {
       label: label || undefined,
@@ -360,7 +397,7 @@ const EXTRACT_SCRIPT = `(() => {
       width: Math.round(rect.width),
       height: Math.round(rect.height),
       outerHTML: svg.outerHTML.slice(0, 10000),
-      isLogo: (label && label.toLowerCase().indexOf("logo") !== -1) || svg.closest('[class*="logo"], [class*="brand"], [class*="home"], [class*="marquee"], [class*="partner"], [class*="customer"]') !== null
+      isLogo: isLogo
     };
   }).filter(Boolean).slice(0, 50);
 
diff --git a/packages/cli/src/utils/lintProject.test.ts b/packages/cli/src/utils/lintProject.test.ts
index 157063d95..41171bdb8 100644
--- a/packages/cli/src/utils/lintProject.test.ts
+++ b/packages/cli/src/utils/lintProject.test.ts
@@ -475,6 +475,198 @@ describe("audio_src_not_found", () => {
   });
 });
 
+describe("missing_local_asset", () => {
+  it("errors when <img> src references a file that does not exist", async () => {
+    const html = `<html><body>
+  <div data-composition-id="main" data-width="1920" data-height="1080">
+    <img src="capture/assets/hero.png" />
+  </div>
+  <script>window.__timelines = window.__timelines || {}; window.__timelines["main"] = gsap.timeline({ paused: true });</script>
+</body></html>`;
+    const project = makeProject(html);
+
+    const { totalErrors, results } = await lintProject(project);
+
+    expect(totalErrors).toBeGreaterThan(0);
+    const finding = results[0]?.result.findings.find((f) => f.code === "missing_local_asset");
+    expect(finding).toBeDefined();
+    expect(finding?.severity).toBe("error");
+    expect(finding?.message).toContain("hero.png");
+    expect(finding?.message).toContain("<img>");
+  });
+
+  it("errors when <video> src references a missing file", async () => {
+    const html = `<html><body>
+  <div data-composition-id="main" data-width="1920" data-height="1080">
+    <video id="hero" src="capture/assets/videos/clip.mp4" muted playsinline></video>
+  </div>
+  <script>window.__timelines = window.__timelines || {}; window.__timelines["main"] = gsap.timeline({ paused: true });</script>
+</body></html>`;
+    const project = makeProject(html);
+
+    const { totalErrors, results } = await lintProject(project);
+
+    expect(totalErrors).toBeGreaterThan(0);
+    const finding = results[0]?.result.findings.find((f) => f.code === "missing_local_asset");
+    expect(finding).toBeDefined();
+    expect(finding?.message).toContain("clip.mp4");
+    expect(finding?.message).toContain("<video>");
+  });
+
+  it("errors when <source> src inside <video> references a missing file", async () => {
+    const html = `<html><body>
+  <div data-composition-id="main" data-width="1920" data-height="1080">
+    <video muted playsinline><source src="capture/assets/videos/clip.webm" /></video>
+  </div>
+  <script>window.__timelines = window.__timelines || {}; window.__timelines["main"] = gsap.timeline({ paused: true });</script>
+</body></html>`;
+    const project = makeProject(html);
+
+    const { totalErrors, results } = await lintProject(project);
+
+    expect(totalErrors).toBeGreaterThan(0);
+    const finding = results[0]?.result.findings.find((f) => f.code === "missing_local_asset");
+    expect(finding).toBeDefined();
+    expect(finding?.message).toContain("clip.webm");
+    expect(finding?.message).toContain("<source>");
+  });
+
+  it("does NOT report <audio> srcs (handled by audio_src_not_found)", async () => {
+    const html = `<html><body>
+  <div data-composition-id="main" data-width="1920" data-height="1080">
+    <audio id="vo" src="missing.mp3" data-start="0" data-duration="3" data-track-index="0" data-volume="1"></audio>
+  </div>
+  <script>window.__timelines = window.__timelines || {}; window.__timelines["main"] = gsap.timeline({ paused: true });</script>
+</body></html>`;
+    const project = makeProject(html);
+
+    const { results } = await lintProject(project);
+
+    const localAsset = results[0]?.result.findings.find((f) => f.code === "missing_local_asset");
+    const audio = results[0]?.result.findings.find((f) => f.code === "audio_src_not_found");
+    expect(localAsset).toBeUndefined();
+    expect(audio).toBeDefined();
+  });
+
+  it("does NOT report remote URLs (https:, data:, blob:)", async () => {
+    const html = `<html><body>
+  <div data-composition-id="main" data-width="1920" data-height="1080">
+    <img src="https://example.com/x.png" />
+    <img src="data:image/png;base64,iVBOR" />
+    <img src="blob:foo" />
+  </div>
+  <script>window.__timelines = window.__timelines || {}; window.__timelines["main"] = gsap.timeline({ paused: true });</script>
+</body></html>`;
+    const project = makeProject(html);
+
+    const { results } = await lintProject(project);
+
+    const finding = results[0]?.result.findings.find((f) => f.code === "missing_local_asset");
+    expect(finding).toBeUndefined();
+  });
+
+  it("does NOT report template placeholders (__VIDEO_SRC__)", async () => {
+    const html = `<html><body>
+  <div data-composition-id="main" data-width="1920" data-height="1080">
+    <video src="__VIDEO_SRC__"></video>
+  </div>
+  <script>window.__timelines = window.__timelines || {}; window.__timelines["main"] = gsap.timeline({ paused: true });</script>
+</body></html>`;
+    const project = makeProject(html);
+
+    const { results } = await lintProject(project);
+
+    const finding = results[0]?.result.findings.find((f) => f.code === "missing_local_asset");
+    expect(finding).toBeUndefined();
+  });
+
+  it("does not error when referenced files exist on disk", async () => {
+    const html = `<html><body>
+  <div data-composition-id="main" data-width="1920" data-height="1080">
+    <img src="hero.png" />
+    <video src="clip.mp4"></video>
+  </div>
+  <script>window.__timelines = window.__timelines || {}; window.__timelines["main"] = gsap.timeline({ paused: true });</script>
+</body></html>`;
+    const project = makeProject(html);
+    writeFileSync(join(project.dir, "hero.png"), "fake");
+    writeFileSync(join(project.dir, "clip.mp4"), "fake");
+
+    const { results } = await lintProject(project);
+
+    const finding = results[0]?.result.findings.find((f) => f.code === "missing_local_asset");
+    expect(finding).toBeUndefined();
+  });
+
+  it("resolves sub-composition relative paths (../assets/foo.png)", async () => {
+    const subComp = `<html><body>
+  <div data-composition-id="scene" data-width="1920" data-height="1080">
+    <img src="../assets/foo.png" />
+  </div>
+  <script>window.__timelines = window.__timelines || {}; window.__timelines["scene"] = gsap.timeline({ paused: true });</script>
+</body></html>`;
+    const project = makeProject(validHtml(), { "scene.html": subComp });
+    mkdirSync(join(project.dir, "assets"), { recursive: true });
+    writeFileSync(join(project.dir, "assets", "foo.png"), "fake");
+
+    const { results } = await lintProject(project);
+
+    const finding = results[0]?.result.findings.find((f) => f.code === "missing_local_asset");
+    expect(finding).toBeUndefined();
+  });
+
+  it("deduplicates the same missing src across multiple compositions", async () => {
+    const project = makeProject(
+      `<html><body>
+  <div data-composition-id="main" data-width="1920" data-height="1080">
+    <img src="capture/assets/x.png" />
+  </div>
+  <script>window.__timelines = window.__timelines || {}; window.__timelines["main"] = gsap.timeline({ paused: true });</script>
+</body></html>`,
+      {
+        "scene-a.html": `<html><body>
+  <div data-composition-id="a" data-width="1920" data-height="1080">
+    <img src="../capture/assets/x.png" />
+  </div>
+  <script>window.__timelines = window.__timelines || {}; window.__timelines["a"] = gsap.timeline({ paused: true });</script>
+</body></html>`,
+        "scene-b.html": `<html><body>
+  <div data-composition-id="b" data-width="1920" data-height="1080">
+    <img src="../capture/assets/x.png" />
+  </div>
+  <script>window.__timelines = window.__timelines || {}; window.__timelines["b"] = gsap.timeline({ paused: true });</script>
+</body></html>`,
+      },
+    );
+
+    const { results } = await lintProject(project);
+
+    const finding = results[0]?.result.findings.find((f) => f.code === "missing_local_asset");
+    expect(finding).toBeDefined();
+    // x.png mentioned only once despite three references
+    const occurrences = (finding?.message.match(/x\.png/g) ?? []).length;
+    expect(occurrences).toBe(1);
+  });
+
+  it("emits separate findings per tag type (img + video) for clear messaging", async () => {
+    const html = `<html><body>
+  <div data-composition-id="main" data-width="1920" data-height="1080">
+    <img src="missing.png" />
+    <video src="missing.mp4"></video>
+  </div>
+  <script>window.__timelines = window.__timelines || {}; window.__timelines["main"] = gsap.timeline({ paused: true });</script>
+</body></html>`;
+    const project = makeProject(html);
+
+    const { results } = await lintProject(project);
+
+    const findings = results[0]?.result.findings.filter((f) => f.code === "missing_local_asset");
+    expect(findings).toHaveLength(2);
+    expect(findings?.some((f) => f.message.includes("<img>"))).toBe(true);
+    expect(findings?.some((f) => f.message.includes("<video>"))).toBe(true);
+  });
+});
+
 describe("texture_mask_asset_not_found", () => {
   it("errors when CSS mask-image references a missing local texture", async () => {
     const html = `<html><body>
diff --git a/packages/cli/src/utils/lintProject.ts b/packages/cli/src/utils/lintProject.ts
index d2b69c68b..6f1ce9875 100644
--- a/packages/cli/src/utils/lintProject.ts
+++ b/packages/cli/src/utils/lintProject.ts
@@ -223,6 +223,7 @@ export async function lintProject(project: ProjectDir): Promise<ProjectLintResul
   const projectFindings = [
     ...lintProjectAudioFiles(project.dir, allHtmlSources),
     ...lintAudioSrcNotFound(project.dir, allHtmlSources),
+    ...lintMissingLocalAsset(project.dir, allHtmlSources),
     ...lintTextureMaskAssetNotFound(project.dir, allHtmlSources),
     ...lintMultipleRootCompositions(project.dir),
     ...lintDuplicateAudioTracks(allHtmlSources),
@@ -337,6 +338,123 @@ function lintAudioSrcNotFound(
   return findings;
 }
 
+/**
+ * Replace every match of `pattern` (must carry the `g` flag, or only the first
+ * match is masked) with same-length spaces so string offsets stay aligned.
+ * Hides comment / <style> / <script> ranges before scanning for live elements.
+ */
+function maskRange(src: string, pattern: RegExp): string {
+  return src.replace(pattern, (m) => " ".repeat(m.length));
+}
+
+/**
+ * Mask `<!-- … -->`, `<style>…</style>`, and `<script>…</script>` so a literal
+ * `<img src="x.png">` written inside a tutorial comment or an example string
+ * inside a script tag isn't reported as a real asset reference.
+ */
+function maskNonScannableRanges(html: string): string {
+  // One left-to-right alternation pass: whichever construct opens first owns
+  // its range, so a "<!--" string literal inside a <script> can no longer
+  // pair with a later real "-->" and blank the live markup in between.
+  // Closing tags match `</tag` plus anything up to the next `>` because
+  // browsers accept `</script  >`, `</script\t\nbar>`, etc.; a stricter regex
+  // would leak inner content (CodeQL js/bad-tag-filter requires this shape).
+  const nonScannable =
+    /<!--[\s\S]*?-->|<style\b[^>]*>[\s\S]*?<\/style\b[^>]*>|<script\b[^>]*>[\s\S]*?<\/script\b[^>]*>/gi;
+  return maskRange(html, nonScannable);
+}
+
+/**
+ * Check for <video>, <img>, and <source> elements whose src points to a local
+ * file that doesn't exist in the project. Mirrors lintAudioSrcNotFound but for
+ * visual assets. The renderer 404s silently and produces a video with the asset
+ * just missing from frame — no error surfaced, no log.
+ *
+ * Empirically the most common sub-agent mistake per references/step-5-build.md
+ * ASSET PATHS section (~5+ agents per multi-URL run). Combined with the fact
+ * that captured asset filenames are unreliable (heygen-logo.svg may contain
+ * Google), this rule catches the proximate cause of broken-asset renders.
+ *
+ * <audio> is handled separately by lintAudioSrcNotFound — skip here to avoid
+ * double-reporting and to preserve audio's tailored "silent video" message.
+ */
+// fallow-ignore-next-line complexity
+function lintMissingLocalAsset(
+  projectDir: string,
+  htmlSources: HtmlSource[],
+): HyperframeLintFinding[] {
+  const findings: HyperframeLintFinding[] = [];
+
+  // Require whitespace right before `src` and scan lazily so the element's
+  // real src attribute is matched, never a `data-src`-style lookalike.
+  const localAssetSrcRe = /<(video|img|source)\b[^>]*?\ssrc\s*=\s*["']([^"']+)["'][^>]*>/gi;
+
+  // tagName → Map<resolvedAbsPath, representativeSrc>. Dedup by RESOLVED path
+  // so `capture/x.png` (root) and `../capture/x.png` (sub-comp) yield ONE
+  // finding; the first src seen is what surfaces in the user-facing message.
+  const missingByTag = new Map<string, Map<string, string>>();
+
+  for (const { html, compSrcPath } of htmlSources) {
+    // Mask comments, <style>, and <script> ranges so a commented-out or
+    // example `<img src="missing.png">` isn't reported as a real missing asset.
+    // Spaces preserve offsets so reported indices still match the source.
+    const scannable = maskNonScannableRanges(html);
+    const re = new RegExp(localAssetSrcRe.source, localAssetSrcRe.flags);
+    let match: RegExpExecArray | null;
+    while ((match = re.exec(scannable)) !== null) {
+      const tagName = (match[1] ?? "").toLowerCase();
+      const rawSrc = match[2] ?? "";
+      const src = cleanAssetUrl(rawSrc);
+      if (!src) continue;
+      if (isRemoteOrInlineUrl(src)) continue;
+      if (/^__[A-Z_]+__$/.test(src)) continue; // template placeholder
+      // Sub-composition srcs are written relative to the sub-comp file
+      // (e.g. "../assets/foo.png"); the bundler rewrites them to root-relative
+      // before serving. Mirror that rewrite so the existence check sees the
+      // same path the renderer will. Root-html srcs pass through unchanged.
+      const rootRelative = compSrcPath ? rewriteAssetPath(compSrcPath, src) : src;
+      // Use resolveExistingLocalAsset (same helper that stylesheet resolution
+      // uses above) so this rule's notion of "exists" matches the bundler's:
+      // it handles root-absolute "/assets/foo.png" (treated relative to
+      // projectDir, NOT to the host filesystem) and rejects "../outside.png"
+      // that escapes the project — both of which a raw `resolve(projectDir,
+      // rootRelative) + existsSync` would mis-handle (false-positive misses
+      // for the first, false-positive existence for the second if a same-
+      // named file happens to live on the host).
+      const resolvedAsset = resolveExistingLocalAsset(projectDir, rootRelative);
+      if (resolvedAsset) continue;
+
+      const resolvedKey = resolve(projectDir, rootRelative);
+      let bucket = missingByTag.get(tagName);
+      if (!bucket) {
+        bucket = new Map<string, string>();
+        missingByTag.set(tagName, bucket);
+      }
+      if (!bucket.has(resolvedKey)) bucket.set(resolvedKey, src);
+    }
+  }
+
+  for (const [tagName, byResolved] of missingByTag) {
+    const unique = [...byResolved.values()];
+    findings.push({
+      code: "missing_local_asset",
+      severity: "error",
+      message:
+        `<${tagName}> element references local file(s) not found in the project: ${unique.join(", ")}. ` +
+        "The renderer will silently skip these and produce a video with missing visuals.",
+      fixHint:
+        unique.length === 1
+          ? `Add "${unique[0]}" to the project directory, or update the src attribute to point to an existing file. ` +
+            "Common cause: captured asset filenames are unreliable (heygen-logo.svg often contains Google, nvidia-logo.svg may contain Autodesk, etc.). " +
+            "Open the contact sheets and verify the file actually exists at this path before referencing it."
+          : "Add the missing files to the project directory, or update the src attributes to point to existing files. " +
+            "Captured asset filenames are unreliable — verify against capture/contact-sheets/ and capture/extracted/asset-descriptions.md.",
+    });
+  }
+
+  return findings;
+}
+
 function lintTextureMaskAssetNotFound(
   projectDir: string,
   htmlSources: HtmlSource[],
diff --git a/packages/core/src/lint/utils.ts b/packages/core/src/lint/utils.ts
index 2030a0658..fc69d6876 100644
--- a/packages/core/src/lint/utils.ts
+++ b/packages/core/src/lint/utils.ts
@@ -79,6 +79,28 @@ export function findHtmlTag(source: string): OpenTag | null {
   };
 }
 
+/**
+ * Replace ranges of `src` matched by `pattern` with same-length space runs so
+ * downstream regex offsets stay stable but the matched content is invisible
+ * to subsequent tag extraction. Used to neutralize comments / style+script
+ * block contents that could otherwise look like real opening tags (e.g.
+ * a CSS comment mentioning <video> inside a <style> block would be matched as
+ * a root tag by the bare TAG_PATTERN — observed on a real heygen-showcase run
+ * where a beat shipped two false-positive `root_missing_*` lint errors).
+ */
+function maskRanges(src: string, pattern: RegExp): string {
+  // Clone the pattern with the global flag forced on: without `g`,
+  // String.prototype.replace would only blank the first match, and cloning
+  // keeps the caller's (possibly shared, stateful) regex lastIndex untouched.
+  const p = new RegExp(
+    pattern.source,
+    pattern.flags.includes("g") ? pattern.flags : pattern.flags + "g",
+  );
+  // A single replace pass is linear and, unlike a manual exec() loop, cannot
+  // spin forever when the pattern is able to match the empty string.
+  return src.replace(p, (m) => " ".repeat(m.length));
+}
+
 export function findRootTag(source: string): OpenTag | null {
   const bodyOpenMatch = /<body\b[^>]*>/i.exec(source);
   const bodyCloseMatch = /<\/body>/i.exec(source);
@@ -88,7 +110,15 @@ export function findRootTag(source: string): OpenTag | null {
       ? bodyCloseMatch.index
       : source.length;
   const bodyContent = bodyOpenMatch ? source.slice(bodyStart, bodyEnd) : source;
-  const bodyTags = extractOpenTags(bodyContent);
+  // Mask comment / style / script ranges so a `<video>` token written
+  // inside a CSS or HTML comment doesn't get picked as the composition's
+  // root tag. Same-length space substitution preserves offsets so the
+  // returned tag's `.index` still points at the real position in `source`.
+  const masked = maskRanges(
+    maskRanges(maskRanges(bodyContent, /<!--[\s\S]*?-->/g), STYLE_BLOCK_PATTERN),
+    SCRIPT_BLOCK_PATTERN,
+  );
+  const bodyTags = extractOpenTags(masked);
   for (const tag of bodyTags) {
     if (["script", "style", "meta", "link", "title"].includes(tag.name)) continue;
     return { ...tag, index: tag.index + bodyStart };
diff --git a/packages/producer/build.mjs b/packages/producer/build.mjs
index 22fceae09..62329a306 100644
--- a/packages/producer/build.mjs
+++ b/packages/producer/build.mjs
@@ -19,8 +19,21 @@ const scriptDir = dirname(fileURLToPath(import.meta.url));
 // esbuild's CJS interop (__require) works correctly in ESM output.
 // Without this, bundled CJS deps (recast, yauzl, etc.) that call
 // require("fs") throw "Dynamic require of 'fs' is not supported".
+//
+// It also reconstructs the CommonJS `__dirname` / `__filename` globals from
+// import.meta.url. Bundled CJS deps (notably the ffmpeg/Emscripten wasm glue,
+// which does `scriptDirectory = __dirname + "/"`) reference bare `__dirname`,
+// which does not exist in ESM scope — without this shim they throw
+// "__dirname is not defined in ES module scope" at render time.
 const cjsBanner = {
-  js: "import { createRequire as __cjsRequire } from 'module'; const require = __cjsRequire(import.meta.url);",
+  js: [
+    "import { createRequire as __cjsRequire } from 'module';",
+    "import { fileURLToPath as __fileURLToPath } from 'url';",
+    "import { dirname as __pathDirname } from 'path';",
+    "const require = __cjsRequire(import.meta.url);",
+    "const __filename = __fileURLToPath(import.meta.url);",
+    "const __dirname = __pathDirname(__filename);",
+  ].join(" "),
 };
 
 const workspaceAliasPlugin = {

From 9db1b9d31401da873d4f3aebdcad0063960862d2 Mon Sep 17 00:00:00 2001
From: ukimsanov <ular.kimsanov@heygen.com>
Date: Sun, 14 Jun 2026 00:44:47 -0700
Subject: [PATCH 2/2] feat(cli): capture-video on-demand fetcher from capture
 manifest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `hyperframes capture-video <project>` which downloads a single
video from the capture's video-manifest.json on demand. The capture
pipeline writes the manifest + preview PNGs but deliberately skips
the mp4s — a site with 12+ feature videos would balloon the capture
from ~5 MB to hundreds of MB. This command pulls one entry at a
time, addressed by --index N (matched against the manifest entry's
own `index` field, not array offset — the manifest can have gaps).

  • SSRF-safe via `safeFetch` from capture/assetDownloader (rejects
    private/metadata hosts, re-validates redirects).
  • Content-type whitelist: `video/*` + a small set of common
    `application/*` variants. Anything else (HTML error pages,
    JSON, tracking pixels) aborts cleanly.
  • 250 MB hard cap on Content-Length AND body size.
  • Filename sanitization: percent-decoded, then each run of characters
    outside [A-Za-z0-9._-] is replaced with a single underscore.
  • Race-free write: `flag: "wx"` atomic exclusive-create with
    EEXIST handled as 'already downloaded' (no upstream existsSync
    precheck — eliminates the TOCTOU pattern).
  • Dual-layout aware: checks `<dir>/extracted/` (standalone
    capture) and `<dir>/capture/extracted/` (W2H project layout).
  • Suggested embed snippet includes `id="video-${entry.index}"`
    so the output passes the producer's media discovery / lint.

Registered in cli.ts + help.ts.
---
 packages/cli/src/cli.ts                       |   1 +
 .../cli/src/commands/capture-video.test.ts    | 140 +++++++++
 packages/cli/src/commands/capture-video.ts    | 289 ++++++++++++++++++
 packages/cli/src/help.ts                      |   1 +
 4 files changed, 431 insertions(+)
 create mode 100644 packages/cli/src/commands/capture-video.test.ts
 create mode 100644 packages/cli/src/commands/capture-video.ts

diff --git a/packages/cli/src/cli.ts b/packages/cli/src/cli.ts
index 372d13bfd..ec9690a6a 100644
--- a/packages/cli/src/cli.ts
+++ b/packages/cli/src/cli.ts
@@ -135,6 +135,7 @@ const subCommands = {
   validate: () => import("./commands/validate.js").then((m) => m.default),
   snapshot: () => import("./commands/snapshot.js").then((m) => m.default),
   capture: () => import("./commands/capture.js").then((m) => m.default),
+  "capture-video": () => import("./commands/capture-video.js").then((m) => m.default),
   lambda: () => import("./commands/lambda.js").then((m) => m.default),
   cloudrun: () => import("./commands/cloudrun.js").then((m) => m.default),
   cloud: () => import("./commands/cloud.js").then((m) => m.default),
diff --git a/packages/cli/src/commands/capture-video.test.ts b/packages/cli/src/commands/capture-video.test.ts
new file mode 100644
index 000000000..f7210c356
--- /dev/null
+++ b/packages/cli/src/commands/capture-video.test.ts
@@ -0,0 +1,140 @@
+import { describe, expect, it } from "vitest";
+import {
+  MAX_VIDEO_BYTES,
+  VIDEO_CONTENT_TYPE_RE,
+  pickManifestEntry,
+  safeFilename,
+  type ManifestEntry,
+} from "./capture-video.js";
+
+const ENTRY = (index: number, partial: Partial<ManifestEntry> = {}): ManifestEntry => ({
+  index,
+  url: `https://cdn.example.com/video-${index}.mp4`,
+  filename: `video-${index}.mp4`,
+  width: 1920,
+  height: 1080,
+  heading: "",
+  caption: "",
+  ariaLabel: "",
+  preview: `assets/videos/previews/video-${index}-preview.png`,
+  ...partial,
+});
+
+describe("safeFilename", () => {
+  it("decodes percent-encoded chars", () => {
+    expect(safeFilename("Frame-2147227325%20(1).mp4")).toBe("Frame-2147227325_1_.mp4");
+  });
+
+  it("strips characters outside [A-Za-z0-9._-]", () => {
+    expect(safeFilename("video with spaces & symbols!.mp4")).toBe("video_with_spaces_symbols_.mp4");
+  });
+
+  it("preserves the extension and version markers", () => {
+    expect(safeFilename("hero.mp4")).toBe("hero.mp4");
+    expect(safeFilename("hero-v2.webm")).toBe("hero-v2.webm");
+  });
+
+  it("falls back when decodeURIComponent throws on a malformed sequence", () => {
+    // `%E0%A4%A` is a truncated UTF-8 multibyte sequence and throws URIError.
+    // We should keep the raw input rather than crashing.
+    expect(safeFilename("Bad%E0%A4%A.mp4")).toBe("Bad_E0_A4_A.mp4");
+  });
+
+  it("collapses runs of disallowed characters into a single underscore", () => {
+    expect(safeFilename("a   b___c")).toBe("a_b___c");
+  });
+});
+
+describe("VIDEO_CONTENT_TYPE_RE", () => {
+  it("matches common video content-types", () => {
+    expect(VIDEO_CONTENT_TYPE_RE.test("video/mp4")).toBe(true);
+    expect(VIDEO_CONTENT_TYPE_RE.test("video/webm")).toBe(true);
+    expect(VIDEO_CONTENT_TYPE_RE.test("video/quicktime")).toBe(true);
+  });
+
+  it("matches application/* containers that CDNs commonly use", () => {
+    expect(VIDEO_CONTENT_TYPE_RE.test("application/mp4")).toBe(true);
+    expect(VIDEO_CONTENT_TYPE_RE.test("application/octet-stream")).toBe(true);
+    expect(VIDEO_CONTENT_TYPE_RE.test("application/x-mpegURL")).toBe(true);
+  });
+
+  it("rejects HTML / JSON error pages that pretend to be videos", () => {
+    expect(VIDEO_CONTENT_TYPE_RE.test("text/html")).toBe(false);
+    expect(VIDEO_CONTENT_TYPE_RE.test("application/json")).toBe(false);
+    expect(VIDEO_CONTENT_TYPE_RE.test("image/png")).toBe(false);
+  });
+});
+
+describe("MAX_VIDEO_BYTES", () => {
+  it("is 250 MB", () => {
+    expect(MAX_VIDEO_BYTES).toBe(250 * 1024 * 1024);
+  });
+});
+
+describe("pickManifestEntry", () => {
+  it("returns no-selector when neither --index nor --url is given", () => {
+    const r = pickManifestEntry([ENTRY(0)], {});
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.code).toBe("no-selector");
+  });
+
+  it("looks up by the entry's `index` field, NOT array offset (manifest gaps)", () => {
+    // Captures with failed-preview entries skip the push, so `index` can have
+    // gaps. A user running `--index 3` after seeing `[3] hero` in --list
+    // should get the entry whose `index === 3`, even if it's not array[3].
+    const manifest = [ENTRY(0), ENTRY(2), ENTRY(3)]; // index 1 missing
+    const r = pickManifestEntry(manifest, { index: 3 });
+    expect(r.ok).toBe(true);
+    if (r.ok) expect(r.entry.url).toBe("https://cdn.example.com/video-3.mp4");
+  });
+
+  it("rejects a request for an index that's not in the manifest", () => {
+    const manifest = [ENTRY(0), ENTRY(2)]; // index 1 missing
+    const r = pickManifestEntry(manifest, { index: 1 });
+    expect(r.ok).toBe(false);
+    if (!r.ok) {
+      expect(r.code).toBe("no-match-index");
+      expect(r.message).toContain("index=1");
+      expect(r.message).toContain("available: 0, 2");
+    }
+  });
+
+  it("rejects a negative or non-integer index up front", () => {
+    expect(pickManifestEntry([ENTRY(0)], { index: -1 }).ok).toBe(false);
+    expect(pickManifestEntry([ENTRY(0)], { index: 1.5 }).ok).toBe(false);
+    expect(pickManifestEntry([ENTRY(0)], { index: "abc" }).ok).toBe(false);
+  });
+
+  it("accepts numeric-string indices (citty parses positional args as strings)", () => {
+    const r = pickManifestEntry([ENTRY(0), ENTRY(1)], { index: "1" });
+    expect(r.ok).toBe(true);
+    if (r.ok) expect(r.entry.index).toBe(1);
+  });
+
+  it("looks up by exact URL match", () => {
+    const manifest = [ENTRY(0), ENTRY(1)];
+    const r = pickManifestEntry(manifest, { url: "https://cdn.example.com/video-1.mp4" });
+    expect(r.ok).toBe(true);
+    if (r.ok) expect(r.entry.index).toBe(1);
+  });
+
+  it("rejects a URL that doesn't appear in the manifest", () => {
+    const manifest = [ENTRY(0)];
+    const r = pickManifestEntry(manifest, { url: "https://other.com/missing.mp4" });
+    expect(r.ok).toBe(false);
+    if (!r.ok) {
+      expect(r.code).toBe("no-match-url");
+      expect(r.message).toContain("missing.mp4");
+    }
+  });
+
+  it("when both --index and --url are passed, --index wins (CLI's declared priority)", () => {
+    const manifest = [ENTRY(0), ENTRY(1)];
+    const r = pickManifestEntry(manifest, {
+      index: 1,
+      url: "https://cdn.example.com/video-0.mp4",
+    });
+    expect(r.ok).toBe(true);
+    if (r.ok) expect(r.entry.index).toBe(1);
+  });
+});
diff --git a/packages/cli/src/commands/capture-video.ts b/packages/cli/src/commands/capture-video.ts
new file mode 100644
index 000000000..6ad8ac9b7
--- /dev/null
+++ b/packages/cli/src/commands/capture-video.ts
@@ -0,0 +1,289 @@
+import { defineCommand } from "citty";
+import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
+import { resolve, join, basename } from "node:path";
+import { c } from "../ui/colors.js";
+import { safeFetch } from "../capture/assetDownloader.js";
+import type { Example } from "./_examples.js";
+
+export const examples: Example[] = [
+  [
+    "Download the hero video (index 0) from a captured project's manifest",
+    "capture-video ./my-project --index 0",
+  ],
+  [
+    "Download a specific video by exact URL",
+    "capture-video ./my-project --url https://cdn.example.com/hero.mp4",
+  ],
+  ["List entries in the manifest without downloading", "capture-video ./my-project --list"],
+]; // each entry pairs a description with the command line it describes
+
+/**
+ * The capture pipeline writes capture/extracted/video-manifest.json listing
+ * every <video> element on the source page (URL, dimensions, heading, caption,
+ * preview PNG) but deliberately does NOT download the mp4s — sites with
+ * dozens of feature videos would balloon the capture size to hundreds of MB.
+ *
+ * This command lets agents pull just the ONE video a beat needs (e.g.
+ * heygen.com's "Orb" hero animation at index 0) on demand. The downloaded
+ * file lands at capture/assets/videos/<filename-from-manifest> so beat
+ * compositions can reference it as `capture/assets/videos/<filename>` —
+ * same pattern as the captured SVGs and rasters.
+ */
+
+/** Hard cap on a downloaded video: 250 MiB (250 * 1024 * 1024 bytes). Author's
+ * rationale: long social/marketing reels rarely exceed 50 MB, so this leaves
+ * headroom for higher-quality masters; anything larger is treated as a wrong URL.
+ */
+const MAX_VIDEO_BYTES = 250 * 1024 * 1024;
+
+/** Allow-list of Content-Type values accepted as video: `video/*`, `application/mp4`,
+ * `application/octet-stream`, `application/x-mpegurl`. Case-insensitive and anchored at
+ * the start only, so trailing parameters still match. Anything else is rejected.
+ */
+const VIDEO_CONTENT_TYPE_RE = /^(video\/|application\/(mp4|octet-stream|x-mpegurl))/i;
+
+/** Fetch `url` fully into memory. Throws when the fetch is blocked or fails, the
+ * status is not OK, the content-type is not video-like, or the size cap is hit. */
+// fallow-ignore-next-line complexity
+async function fetchToBuffer(url: string): Promise<Buffer> {
+  const mb = (bytes: number): number => Math.round(bytes / 1024 / 1024);
+  const capMb = mb(MAX_VIDEO_BYTES);
+  // safeFetch (rather than a bare fetch) is relied on for SSRF protection: private
+  // and metadata hosts are rejected on the initial URL and on every redirect hop.
+  const response = await safeFetch(url, {
+    signal: AbortSignal.timeout(120_000),
+    headers: { "User-Agent": "HyperFrames/1.0" },
+  });
+  if (!response) {
+    throw new Error(
+      `fetch blocked or failed (private/metadata host, redirect chain, or network error): ${url}`,
+    );
+  }
+  if (!response.ok) throw new Error(`HTTP ${response.status} ${response.statusText} for ${url}`);
+  const contentType = response.headers.get("content-type") || "";
+  if (!VIDEO_CONTENT_TYPE_RE.test(contentType)) {
+    throw new Error(
+      `unexpected content-type "${contentType}" for ${url} — expected video/*. The URL probably doesn't point at a real video file.`,
+    );
+  }
+  // Advertised size first (the header may be missing or wrong), then the real body size.
+  const declared = response.headers.get("content-length");
+  if (declared && Number(declared) > MAX_VIDEO_BYTES) {
+    throw new Error(`video too large (${mb(Number(declared))}MB > ${capMb}MB cap) for ${url}`);
+  }
+  const body = await response.arrayBuffer();
+  if (body.byteLength > MAX_VIDEO_BYTES) {
+    throw new Error(`video body exceeds ${capMb}MB cap (got ${mb(body.byteLength)}MB) for ${url}`);
+  }
+  return Buffer.from(body);
+}
+
+export function safeFilename(name: string): string {
+  // Decode URL-escapes (e.g. "Frame-2147227325%20(1).mp4"), then collapse each
+  // run outside [A-Za-z0-9._-] into "_" so the extension survives. "", "." and
+  // ".." would resolve to a directory once joined, so they become "video".
+  let decoded = name;
+  try {
+    decoded = decodeURIComponent(name);
+  } catch {
+    /* malformed escape sequence: sanitize the raw name instead */
+  }
+  return decoded.replace(/[^A-Za-z0-9._-]+/g, "_").replace(/^\.{0,2}$/, "video");
+}
+
+/** Exported for tests: the content-type allow-list and the download size cap. */
+export { VIDEO_CONTENT_TYPE_RE, MAX_VIDEO_BYTES };
+
+export interface ManifestEntry {
+  index: number; // entry's own id; may have gaps, so it is not an array offset
+  url: string; // source URL the video is fetched from
+  filename: string; // suggested output name; may still contain URL-escapes
+  width: number; // printed as width×height; presumably pixels — TODO confirm
+  height: number;
+  heading: string; // shown by --list only when non-empty
+  caption: string; // not read by this command
+  ariaLabel: string; // not read by this command
+  preview: string; // presumably the preview PNG reference; not read by this command
+}
+
+export type PickResult =
+  | { ok: true; entry: ManifestEntry } // a manifest entry matched the selector
+  | {
+      ok: false;
+      code: "no-selector" | "bad-index" | "no-match-index" | "no-match-url"; // failure kind
+      message: string; // human-readable explanation
+    };
+
+/**
+ * Pick a manifest entry from `--index N` or `--url URL`. Looks up by the
+ * entry's own `index` field (NOT array offset — `captureVideoManifest` can
+ * skip videos whose preview screenshot fails, leaving gaps in `index`).
+ * @param manifest Parsed entries of video-manifest.json.
+ * @param args Selectors; `index` takes priority over `url`. `index` may be a
+ *   number or a numeric string (CLI flag values arrive as strings).
+ * @returns `{ ok: true, entry }` on a match; otherwise `{ ok: false }` with a
+ *   machine-readable `code` and a printable `message`.
+ */
+export function pickManifestEntry(
+  manifest: ManifestEntry[],
+  args: { index?: string | number | null; url?: string | null },
+): PickResult {
+  type Failure = Extract<PickResult, { ok: false }>;
+  const fail = (code: Failure["code"], message: string): Failure => ({ ok: false, code, message });
+  if (args.index != null) {
+    // Number("") and Number("  ") are 0, so a blank flag value would silently
+    // select entry 0 — reject blank strings before converting.
+    const blank = typeof args.index === "string" && args.index.trim() === "";
+    const i = blank ? NaN : Number(args.index);
+    if (!Number.isInteger(i) || i < 0) {
+      return fail("bad-index", `--index ${args.index} must be a non-negative integer`);
+    }
+    const found = manifest.find((e) => e.index === i);
+    if (found) return { ok: true, entry: found };
+    const available = manifest.map((e) => e.index).join(", ");
+    return fail(
+      "no-match-index",
+      `no manifest entry with index=${i} (available: ${available || "none"})`,
+    );
+  }
+  if (args.url != null) {
+    const found = manifest.find((e) => e.url === args.url);
+    if (found) return { ok: true, entry: found };
+    return fail("no-match-url", `no manifest entry with url=${args.url}`);
+  }
+  return fail(
+    "no-selector",
+    "specify --index <N> or --url <URL> (or --list to see what's in the manifest)",
+  );
+}
+
+/** CLI command: download one video listed in video-manifest.json into assets/videos. */
+export default defineCommand({
+  meta: {
+    name: "capture-video",
+    description:
+      "Download a video referenced in capture/extracted/video-manifest.json (on-demand; the capture pipeline only writes the manifest + preview PNGs)",
+  },
+  args: {
+    project: {
+      type: "positional",
+      description: "Path to the captured project directory",
+      required: true,
+    },
+    index: {
+      type: "string",
+      description: "Manifest entry index to download (0-based)",
+    },
+    url: {
+      type: "string",
+      description: "Exact video URL to download (must match a manifest entry)",
+    },
+    list: {
+      type: "boolean",
+      description: "List manifest entries (index, dimensions, heading) and exit",
+    },
+  },
+  // CLI orchestrator: locate manifest → parse → --list or pick → download → print snippet.
+  // The branch count is intrinsic to that surface, hence the complexity waiver below.
+  // fallow-ignore-next-line complexity
+  async run({ args }) {
+    const projectDir = resolve(String(args.project));
+    // Two layouts are accepted: <dir>/extracted/... (`hyperframes capture <url> --output <dir>`)
+    // and <project>/capture/extracted/... (W2H projects); the direct layout wins if both exist.
+    const directPath = join(projectDir, "extracted", "video-manifest.json");
+    const w2hPath = join(projectDir, "capture", "extracted", "video-manifest.json");
+    const manifestPath = existsSync(directPath) ? directPath : w2hPath;
+    const isW2hLayout = manifestPath === w2hPath;
+    if (!existsSync(manifestPath)) {
+      console.error(
+        `${c.error("✗")} no video-manifest.json at ${directPath} or ${w2hPath}\n` +
+          `  Was this directory produced by \`hyperframes capture\`?`,
+      );
+      process.exitCode = 1;
+      return;
+    }
+    let manifest: ManifestEntry[];
+    try {
+      // JSON.parse accepts any JSON value; everything below assumes an array.
+      const parsed: unknown = JSON.parse(readFileSync(manifestPath, "utf-8"));
+      if (!Array.isArray(parsed)) throw new Error("top-level value is not an array");
+      manifest = parsed;
+    } catch (e) {
+      console.error(`${c.error("✗")} video-manifest.json is malformed: ${(e as Error).message}`);
+      process.exitCode = 1;
+      return;
+    }
+
+    if (args.list) {
+      if (manifest.length === 0) {
+        console.log(c.dim("(manifest is empty — no <video> elements on the captured page)"));
+        return;
+      }
+      console.log(
+        `${manifest.length} video entr${manifest.length === 1 ? "y" : "ies"} in ${manifestPath}:`,
+      );
+      for (const e of manifest) {
+        console.log(
+          `  ${c.bold(`[${e.index}]`)} ${e.filename} — ${e.width}×${e.height}` +
+            (e.heading ? `\n      heading: "${e.heading}"` : "") +
+            `\n      url: ${e.url}`,
+        );
+      }
+      return;
+    }
+
+    const pick = pickManifestEntry(manifest, args);
+    if (!pick.ok) {
+      console.error(
+        `${c.error("✗")} ${pick.message}` +
+          (pick.code === "no-match-url" ? `\n  Run with --list to see what's available.` : ""),
+      );
+      process.exitCode = 1;
+      return;
+    }
+    const entry = pick.entry;
+
+    // Match the layout detected from where the manifest was found.
+    const outDir = isW2hLayout
+      ? join(projectDir, "capture", "assets", "videos")
+      : join(projectDir, "assets", "videos");
+    mkdirSync(outDir, { recursive: true });
+    const fname = safeFilename(entry.filename || basename(entry.url));
+    const outPath = join(outDir, fname);
+    const relPath = isW2hLayout ? `capture/assets/videos/${fname}` : `assets/videos/${fname}`;
+
+    // No existsSync precheck: check-then-write is a TOCTOU race (CodeQL
+    // js/file-system-race). The exclusive create below is the only
+    // "already downloaded?" test, so the fetch runs even when the file exists.
+    console.log(
+      `${c.accent("▸")} downloading [${entry.index}] ${entry.filename} (${entry.width}×${entry.height})`,
+    );
+    console.log(`     from: ${entry.url}`);
+    try {
+      const buf = await fetchToBuffer(entry.url);
+      // `flag: "wx"` = exclusive create: throws EEXIST if outPath already exists.
+      try {
+        writeFileSync(outPath, buf, { flag: "wx" });
+      } catch (writeErr) {
+        if ((writeErr as NodeJS.ErrnoException).code === "EEXIST") {
+          console.log(`${c.warn("⚠")}  already downloaded: ${relPath} (skipping)`);
+          console.log(`     Delete the file and re-run to refetch.`);
+          return;
+        }
+        throw writeErr;
+      }
+      const sizeKb = Math.round(buf.length / 1024);
+      const sizeStr = sizeKb > 1024 ? `${(sizeKb / 1024).toFixed(1)}MB` : `${sizeKb}KB`;
+      console.log(`${c.success("◇")}  wrote ${relPath} (${sizeStr})`);
+      // `id` is required by the media lint rule (`media_missing_id`) and the
+      // producer's media discovery; without one the video renders frozen.
+      const snippetId = `video-${entry.index}`;
+      console.log(
+        `     Reference it from a beat composition as:\n` +
+          `       <video id="${snippetId}" src="${relPath}" data-start="0" data-duration="${entry.width === entry.height ? 5 : 4}" data-track-index="0" autoplay muted loop></video>`,
+      );
+    } catch (e) {
+      console.error(`${c.error("✗")} download failed: ${(e as Error).message}`);
+      process.exitCode = 1;
+    }
+  },
+});
diff --git a/packages/cli/src/help.ts b/packages/cli/src/help.ts
index 44acb3737..34f1dd3b3 100644
--- a/packages/cli/src/help.ts
+++ b/packages/cli/src/help.ts
@@ -22,6 +22,7 @@ const GROUPS: Group[] = [
       ["init", "Scaffold a new composition project"],
       ["add", "Install a block or component from the registry"],
       ["capture", "Capture a website for video production"],
+      ["capture-video", "Download a video referenced in a capture manifest"],
       ["catalog", "Browse and install blocks and components"],
       ["preview", "Start the studio for previewing compositions"],
       ["publish", "Upload a project and get a stable public URL"],