niubi2013
diff --git a/‎docs/guides/remove-background.mdx‎
Lines changed: 65 additions & 0 deletions b/‎docs/guides/remove-background.mdx‎
Lines changed: 65 additions & 0 deletions
diff --git a/‎docs/packages/cli.mdx‎
Lines changed: 6 additions & 1 deletion b/‎docs/packages/cli.mdx‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎packages/cli/src/background-removal/inference.test.ts‎
Lines changed: 110 additions & 1 deletion b/‎packages/cli/src/background-removal/inference.test.ts‎
Lines changed: 110 additions & 1 deletion
diff --git a/‎packages/cli/src/background-removal/inference.ts‎
Lines changed: 82 additions & 15 deletions b/‎packages/cli/src/background-removal/inference.ts‎
Lines changed: 82 additions & 15 deletions
@@ -80,6 +80,71 @@ npx hyperframes remove-background subject.mp4 -o transparent.mov         # editi
 npx hyperframes remove-background portrait.jpg -o cutout.png       # still image
 ```
 
+## Layer separation: emit the cutout and the background plate together
+
+Pass `--background-output` (alias `-b`) to write a *second* transparent video alongside the cutout. It carries the same source RGB, but its alpha is the *inverse* mask — opaque where the surroundings are, transparent where the subject is. The result is a clean two-layer separation in a single inference pass:
+
+```bash Terminal
+npx hyperframes remove-background subject.mp4 \
+  -o subject.webm \
+  --background-output plate.webm
+```
+
+| Output | Alpha | Use it as |
+| ------ | ----- | --------- |
+| `subject.webm` | Mask — subject opaque | Foreground layer (top of stack) |
+| `plate.webm` | `255 − mask` — subject region transparent | Background layer (bottom of stack); anything that should be occluded by the subject goes **between this layer and `subject.webm`** |
+
+Both encoders share the source width, height, and frame rate, plus your `--quality` preset, so the layers are pixel-aligned. Encode cost roughly doubles; segmentation cost is unchanged.
+
+<Tip>
+**This is a hole-cut plate, not an inpainted clean plate.** The subject region in `plate.webm` is fully transparent — you have to composite something opaque under it (a graphic, a blurred copy, a different scene) to fill the hole. If you need an actual filled background where the subject was, use a video inpainter (LaMa, ProPainter, RunwayML Inpaint) — `remove-background` is not the right tool for that.
+</Tip>
+
+### Hole-cut vs. clean plate — when does the difference matter?
+
+A **hole-cut plate** keeps the original surroundings and makes the subject region transparent. A **clean plate** fills the subject region with reconstructed background — produced by a separate inpainting model. Display each alone over black:
+
+| | Hole-cut plate (this command) | Clean plate (inpainted) |
+| --- | --- | --- |
+| Subject region | Transparent silhouette | Reconstructed background pixels |
+| What you see alone | A person-shaped hole | An empty room |
+| Cost | One inference pass, one extra ffmpeg encode | A second model (LaMa, ProPainter, E2FGVI) |
+| Tool | `remove-background --background-output` | Outside this CLI |
+
+The deciding question is: **does the real background ever need to be visible where the subject used to be?** If it does, you need a clean plate; if something always covers the hole, a hole-cut plate is enough.
+
+| Use case | What you need |
+| --- | --- |
+| Text/graphics live *between* the cutout and the plate (the example above) | **Hole-cut** — the graphics fill the hole. |
+| Composite the subject onto an unrelated scene | Neither. Just use `subject.webm`; the plate is irrelevant. |
+| Show "the room without the person" as a real background | **Clean plate** — a hole-cut plate would show a transparent void. |
+| Replace the person with a different subject (re-target) | **Clean plate** — the new subject needs real pixels under it. |
+| VFX rotoscoping / "remove an extra from this take" | **Clean plate** — the canonical inpainting use case. |
+
+If something opaque always covers the silhouette, hole-cut is sufficient and ~1000× cheaper than running an inpainter.
+
+### The two-layer composition pattern
+
+The two-layer pattern is functionally a drop-in for [text-behind-subject](#text-behind-subject-the-recommended-layout) without needing the original `presenter.mp4` in the project — the plate replaces it as the bottom layer:
+
+```html
+<!-- z=1 inverse-alpha plate fills everything except the subject's silhouette -->
+<video src="plate.webm" data-start="0" data-duration="6" data-track-index="0" muted playsinline></video>
+
+<!-- z=2 anything you want occluded by the subject lives here -->
+<h1 style="z-index:2; position:absolute; top:50%; left:50%; transform:translate(-50%,-50%);">
+  MAKE IT IN HYPERFRAMES
+</h1>
+
+<!-- z=3 the cutout puts the subject back on top -->
+<div class="cutout-wrap" style="position:absolute;inset:0;z-index:3">
+  <video src="subject.webm" data-start="0" data-duration="6" data-track-index="1" muted playsinline></video>
+</div>
+```
+
+Constraints: the flag requires a video input and `.webm` or `.mov` for both outputs. It's not valid for image inputs (no temporal pairing to do) and won't accept `.png` for the plate.
+
 ## Performance
 
 Real-world numbers from the [matting eval](https://www.heygenverse.com/a/0dd5a431-1832-4858-862d-de7fb7d02654), running u²-net_human_seg on a 4-second 1080p clip:
 
@@ -356,6 +356,10 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_
     # Single image → transparent PNG
     npx hyperframes remove-background portrait.jpg -o cutout.png
 
+    # Layer separation: cutout AND inverse-alpha background plate in one pass
+    npx hyperframes remove-background avatar.mp4 \
+      -o subject.webm --background-output plate.webm
+
     # Force CPU on a machine that has CoreML or CUDA
     npx hyperframes remove-background avatar.mp4 -o transparent.webm --device cpu
 
@@ -366,8 +370,9 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_
     | Flag | Description |
     |------|-------------|
     | `--output, -o` | Output path. Format inferred from extension: `.webm` (default), `.mov`, `.png` |
+    | `--background-output, -b` | Optional second output: inverse-alpha background plate (subject region transparent, surroundings opaque). Same source RGB, complementary mask. Must be `.webm` or `.mov`. Hole-cut, not inpainted — composite something underneath to fill the hole. |
     | `--device` | Execution provider: `auto` (default), `cpu`, `coreml`, `cuda` |
-    | `--quality` | WebM encoder preset: `fast` (crf 30, smallest), `balanced` (crf 18, default), `best` (crf 12, near-lossless). Higher quality keeps the cutout's RGB closer to the source mp4 — important when overlaying the cutout on its own source for text-behind-subject effects. Ignored for `.mov` / `.png`. |
+    | `--quality` | WebM encoder preset: `fast` (crf 30, smallest), `balanced` (crf 18, default), `best` (crf 12, near-lossless). Higher quality keeps the cutout's RGB closer to the source mp4 — important when overlaying the cutout on its own source for text-behind-subject effects. Applies to both `--output` and `--background-output`. Ignored for `.mov` / `.png`. |
     | `--info` | Print detected execution providers and exit (no render) |
     | `--json` | Output result as JSON |
 
 
@@ -1,5 +1,5 @@
 import { describe, expect, it } from "vitest";
-import { MEAN, STD } from "./inference.js";
+import { MEAN, STD, applyMask } from "./inference.js";
 
 // Regression: the u2net_human_seg model was trained with ImageNet
 // normalization. Drifting away from these exact values changes the input
@@ -16,3 +16,112 @@ describe("background-removal/inference — rembg u2net_human_seg parity", () =>
     expect(STD).toEqual([0.229, 0.224, 0.225]);
   });
 });
+
+// These tests pin the contract that `--background-output` is built on:
+// fg.alpha + bg.alpha === 255 per pixel, and the RGB plane is byte-identical
+// between fg and bg. A future change to the postprocess loop (different mask
+// threshold, premultiplied alpha, gamma-corrected compositing) that breaks
+// either invariant should fail here loudly.
+describe("background-removal/inference — applyMask invariants", () => {
+  function makeRgb(pixels: number): Buffer {
+    // Deterministic but non-trivial RGB so byte equality is meaningful.
+    const buf = Buffer.allocUnsafe(pixels * 3);
+    for (let i = 0; i < pixels; i++) {
+      buf[i * 3] = (i * 7) & 0xff;
+      buf[i * 3 + 1] = (i * 13 + 31) & 0xff;
+      buf[i * 3 + 2] = (i * 19 + 61) & 0xff;
+    }
+    return buf;
+  }
+
+  function makeMask(pixels: number): Buffer {
+    // Stride by 37 (mod 256) so alphas are spread across the byte range, starting
+    // at 0. 255 first appears at i = 83; the saturation test pins both endpoints.
+    const buf = Buffer.allocUnsafe(pixels);
+    for (let i = 0; i < pixels; i++) buf[i] = (i * 37) & 0xff;
+    return buf;
+  }
+
+  it("dual-output: fg.alpha + bg.alpha === 255 for every pixel", () => {
+    const pixels = 64;
+    const rgb = makeRgb(pixels);
+    const mask = makeMask(pixels);
+    const fg = Buffer.allocUnsafe(pixels * 4);
+    const bg = Buffer.allocUnsafe(pixels * 4);
+
+    const result = applyMask(rgb, mask, fg, bg, pixels);
+
+    expect(result.fg).toBe(fg);
+    expect(result.bg).toBe(bg);
+    for (let i = 0; i < pixels; i++) {
+      const sum = fg[i * 4 + 3]! + bg[i * 4 + 3]!;
+      expect(sum).toBe(255);
+    }
+  });
+
+  it("dual-output: RGB triples are byte-identical between fg and bg", () => {
+    const pixels = 64;
+    const rgb = makeRgb(pixels);
+    const mask = makeMask(pixels);
+    const fg = Buffer.allocUnsafe(pixels * 4);
+    const bg = Buffer.allocUnsafe(pixels * 4);
+
+    applyMask(rgb, mask, fg, bg, pixels);
+
+    for (let i = 0; i < pixels; i++) {
+      expect(fg[i * 4]).toBe(bg[i * 4]);
+      expect(fg[i * 4 + 1]).toBe(bg[i * 4 + 1]);
+      expect(fg[i * 4 + 2]).toBe(bg[i * 4 + 2]);
+      // And both match the source.
+      expect(fg[i * 4]).toBe(rgb[i * 3]);
+      expect(fg[i * 4 + 1]).toBe(rgb[i * 3 + 1]);
+      expect(fg[i * 4 + 2]).toBe(rgb[i * 3 + 2]);
+    }
+  });
+
+  it("dual-output: fg.alpha equals the input mask", () => {
+    const pixels = 32;
+    const rgb = makeRgb(pixels);
+    const mask = makeMask(pixels);
+    const fg = Buffer.allocUnsafe(pixels * 4);
+    const bg = Buffer.allocUnsafe(pixels * 4);
+
+    applyMask(rgb, mask, fg, bg, pixels);
+
+    for (let i = 0; i < pixels; i++) {
+      expect(fg[i * 4 + 3]).toBe(mask[i]);
+    }
+  });
+
+  it("single-output: bg=null returns bg=null and writes only fg", () => {
+    const pixels = 32;
+    const rgb = makeRgb(pixels);
+    const mask = makeMask(pixels);
+    const fg = Buffer.allocUnsafe(pixels * 4);
+
+    const result = applyMask(rgb, mask, fg, null, pixels);
+
+    expect(result.bg).toBeNull();
+    expect(result.fg).toBe(fg);
+    for (let i = 0; i < pixels; i++) {
+      expect(fg[i * 4]).toBe(rgb[i * 3]);
+      expect(fg[i * 4 + 3]).toBe(mask[i]);
+    }
+  });
+
+  it("saturates correctly at mask=0 and mask=255", () => {
+    // mask=0 → fg.alpha=0 (transparent subject), bg.alpha=255 (fully opaque plate)
+    // mask=255 → fg.alpha=255 (fully opaque subject), bg.alpha=0 (transparent plate)
+    const rgb = Buffer.from([10, 20, 30, 40, 50, 60]);
+    const mask = Buffer.from([0, 255]);
+    const fg = Buffer.allocUnsafe(8);
+    const bg = Buffer.allocUnsafe(8);
+
+    applyMask(rgb, mask, fg, bg, 2);
+
+    expect(fg[3]).toBe(0);
+    expect(bg[3]).toBe(255);
+    expect(fg[7]).toBe(255);
+    expect(bg[7]).toBe(0);
+  });
+});
@@ -24,10 +24,24 @@ interface OrtModule {
   Tensor: typeof Tensor;
 }
 
+export interface SessionResult {
+  /** Foreground cutout: source RGB with the segmentation mask as alpha. */
+  fg: Buffer;
+  /** Inverse-alpha plate: same RGB, alpha is `255 − mask`. Null unless `withBackground` was true. */
+  bg: Buffer | null;
+}
+
 export interface Session {
-  /** Run inference on one RGB frame, return RGBA bytes (H*W*4). */
-  process(rgb: Buffer, width: number, height: number): Promise<Buffer>;
-  /** ORT EP that was actually selected. */
+  /**
+   * Run inference on one RGB frame and composite the resulting mask into RGBA
+   * output (4 bytes per pixel).
+   *
+   * Both `fg` and `bg` (when requested) are session-owned buffers reused on the
+   * next call — drain the encoder's stdin before invoking `process` again.
+   *
+   * @param rgb Packed RGB source frame, 3 bytes per pixel.
+   * @param width Frame width in pixels.
+   * @param height Frame height in pixels.
+   * @param withBackground When true, also fill and return the inverse-alpha
+   *   `bg` plate. Treated as false when omitted.
+   * @returns `fg` always; `bg` is null unless `withBackground` was true.
+   */
+  process(
+    rgb: Buffer,
+    width: number,
+    height: number,
+    withBackground?: boolean,
+  ): Promise<SessionResult>;
   provider: string;
   close(): Promise<void>;
 }
@@ -73,16 +87,15 @@ export async function createSession(options: CreateSessionOptions = {}): Promise
     throw new Error("ONNX session is missing input or output bindings");
   }
 
-  // Pre-allocated per-frame buffers reused across every process() call.
-  // At 1080p this saves ~9 MB of allocations per frame. rgbaBuf is sized
-  // lazily on the first call (we don't know W/H until then).
+  // Reused across calls; sized lazily on first frame. Saves ~9 MB/frame at 1080p.
   const inputData = new Float32Array(3 * INPUT_PLANE);
   const maskBuf = Buffer.allocUnsafe(INPUT_PLANE);
   let rgbaBuf: Buffer | null = null;
+  let rgbaBgBuf: Buffer | null = null;
 
   return {
     provider: providerUsed,
-    async process(rgb, width, height) {
+    async process(rgb, width, height, withBackground = false) {
       const tensor = await preprocess(sharp, ort, rgb, width, height, inputData);
       const outputs = await session.run({ [inputName]: tensor });
       const output = outputs[outputName];
@@ -91,7 +104,21 @@ export async function createSession(options: CreateSessionOptions = {}): Promise
       if (!rgbaBuf || rgbaBuf.length !== expectedBytes) {
         rgbaBuf = Buffer.allocUnsafe(expectedBytes);
       }
-      return await postprocess(sharp, output, rgb, width, height, maskBuf, rgbaBuf);
+      if (withBackground) {
+        if (!rgbaBgBuf || rgbaBgBuf.length !== expectedBytes) {
+          rgbaBgBuf = Buffer.allocUnsafe(expectedBytes);
+        }
+      }
+      return await postprocess(
+        sharp,
+        output,
+        rgb,
+        width,
+        height,
+        maskBuf,
+        rgbaBuf,
+        withBackground ? rgbaBgBuf : null,
+      );
     },
     async close() {
       await session.release();
@@ -141,7 +168,8 @@ async function postprocess(
   height: number,
   maskBuf: Buffer,
   rgbaBuf: Buffer,
-): Promise<Buffer> {
+  rgbaBgBuf: Buffer | null,
+): Promise<SessionResult> {
   const raw = output.data as Float32Array;
 
   let lo = Infinity;
@@ -172,11 +200,50 @@ async function postprocess(
     .raw()
     .toBuffer();
 
-  for (let i = 0; i < width * height; i++) {
-    rgbaBuf[i * 4] = rgb[i * 3]!;
-    rgbaBuf[i * 4 + 1] = rgb[i * 3 + 1]!;
-    rgbaBuf[i * 4 + 2] = rgb[i * 3 + 2]!;
-    rgbaBuf[i * 4 + 3] = fullMask[i]!;
+  return applyMask(rgb, fullMask, rgbaBuf, rgbaBgBuf, width * height);
+}
+
+/**
+ * Composite the RGB source frame with the segmentation mask into one or two
+ * RGBA buffers. The contract `--background-output` relies on:
+ *  - `fg`'s alpha is the mask, `bg`'s alpha (when provided) is `255 − mask`,
+ *    so `fg.alpha + bg.alpha === 255` for every pixel.
+ *  - RGB triples are byte-identical between `fg` and `bg`.
+ *  - When `bg` is null, only `fg` is touched.
+ *
+ * No bounds checks are performed here: callers must size `rgb` to at least
+ * `pixels * 3` bytes, `mask` to `pixels` bytes, and `fg`/`bg` to `pixels * 4`.
+ *
+ * Exported for direct unit testing of the invariants above without spinning
+ * up an ONNX session.
+ *
+ * @param rgb Packed source pixels, 3 bytes (R, G, B) per pixel.
+ * @param mask One alpha byte per pixel.
+ * @param fg Destination RGBA buffer for the cutout; written in place.
+ * @param bg Destination RGBA buffer for the inverse-alpha plate, or null to skip it.
+ * @param pixels Number of pixels to composite.
+ * @returns The same `fg` and `bg` references that were passed in (`bg` is null
+ *   when none was supplied).
+ */
+export function applyMask(
+  rgb: Buffer,
+  mask: Buffer,
+  fg: Buffer,
+  bg: Buffer | null,
+  pixels: number,
+): SessionResult {
+  if (bg) {
+    for (let i = 0; i < pixels; i++) {
+      const r = rgb[i * 3]!;
+      const g = rgb[i * 3 + 1]!;
+      const b = rgb[i * 3 + 2]!;
+      const m = mask[i]!;
+      const o = i * 4;
+      fg[o] = r;
+      fg[o + 1] = g;
+      fg[o + 2] = b;
+      fg[o + 3] = m;
+      bg[o] = r;
+      bg[o + 1] = g;
+      bg[o + 2] = b;
+      bg[o + 3] = 255 - m;
+    }
+    return { fg, bg };
+  }
   for (let i = 0; i < pixels; i++) {
     fg[i * 4] = rgb[i * 3]!;
     fg[i * 4 + 1] = rgb[i * 3 + 1]!;
     fg[i * 4 + 2] = rgb[i * 3 + 2]!;
     fg[i * 4 + 3] = mask[i]!;
   }
-  return rgbaBuf;
+  return { fg, bg: null };
 }