From 4fa8df2c6a8b28c1c3067c87e4d64236d641cd83 Mon Sep 17 00:00:00 2001 From: Tharen Candi Date: Wed, 15 Apr 2026 17:38:46 +0200 Subject: [PATCH 1/3] feat: add stage3 segmentation model flag and SAM3/LangSAM docs - Add --stage3-segmentation-model flag to run_pipeline.sh (sam3|langsam) - Document SAM3 and LangSAM installation requirements - Clarify Stage 1 uses SAM2, Stage 3a uses SAM3/LangSAM - Update VLM-MASK-REASONER README --- VLM-MASK-REASONER/README.md | 29 ++++++++++++++++++++++++++--- VLM-MASK-REASONER/run_pipeline.sh | 29 ++++++++++++++++++++--------- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/VLM-MASK-REASONER/README.md b/VLM-MASK-REASONER/README.md index 4888f83..169cadb 100644 --- a/VLM-MASK-REASONER/README.md +++ b/VLM-MASK-REASONER/README.md @@ -45,8 +45,7 @@ Optional flags: ```bash bash run_pipeline.sh \ - --sam2-checkpoint ../sam2_hiera_large.pt \ - --device cuda + --sam2-checkpoint ../sam2_hiera_large.pt \ --stage3-segmentation-model langsam \ --device cuda ``` This runs four stages automatically: @@ -84,7 +83,7 @@ Install the main requirements from the repo root: pip install -r requirements.txt ``` -### 2. SAM2 +### 2. SAM2 (Stage 1) SAM2 must be installed separately (it is not on PyPI): @@ -99,6 +98,30 @@ Then download the SAM2 checkpoint. The pipeline defaults to `sam2_hiera_large.pt wget https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt ``` +### 3. SAM3 or LangSAM (Stage 3a) + +Stage 3a uses text-prompted segmentation to identify affected objects. 
You need **either** SAM3 or LangSAM: + +**Option A: SAM3** + +```bash +pip install git+https://github.com/facebookresearch/segment-anything-3.git +``` + +Download the SAM3 checkpoint: +```bash +# Check SAM3 repo for latest checkpoint URLs +wget https://dl.fbaipublicfiles.com/segment_anything_3/sam3_hiera_large.pt +``` + +**Option B: LangSAM (alternative)** + +```bash +pip install lang-sam +``` + +To use LangSAM instead of SAM3, pass `--stage3-segmentation-model langsam` to `run_pipeline.sh`. + If you place the checkpoint elsewhere, pass it explicitly: ```bash diff --git a/VLM-MASK-REASONER/run_pipeline.sh b/VLM-MASK-REASONER/run_pipeline.sh index f1b8d98..f3cef3e 100644 --- a/VLM-MASK-REASONER/run_pipeline.sh +++ b/VLM-MASK-REASONER/run_pipeline.sh @@ -3,30 +3,39 @@ # Runs stages 1-4 given a points config JSON (output of point_selector_gui.py) # # Usage: -# bash run_pipeline.sh [--sam2-checkpoint PATH] [--device cuda] +# bash run_pipeline.sh [OPTIONS] # -# Example: +# Options: +# --sam2-checkpoint PATH SAM2 checkpoint for stage 1 (default: ../sam2_hiera_large.pt) +# --device DEVICE cuda or cpu (default: cuda) +# --stage3-segmentation-model MODEL sam3 or langsam for stage 3a (default: sam3) +# +# Examples: # bash run_pipeline.sh my_config_points.json # bash run_pipeline.sh my_config_points.json --sam2-checkpoint ../sam2_hiera_large.pt +# bash run_pipeline.sh my_config_points.json --stage3-segmentation-model langsam set -e # ── Arguments ────────────────────────────────────────────────────────────────── CONFIG="$1" if [ -z "$CONFIG" ]; then - echo "Usage: bash run_pipeline.sh [--sam2-checkpoint PATH] [--device cuda]" + echo "Usage: bash run_pipeline.sh [OPTIONS]" + echo "See script header for available options." 
exit 1 fi SAM2_CHECKPOINT="../sam2_hiera_large.pt" DEVICE="cuda" +STAGE3_SEGMENTATION_MODEL="sam3" # Parse optional flags shift while [[ $# -gt 0 ]]; do case "$1" in - --sam2-checkpoint) SAM2_CHECKPOINT="$2"; shift 2 ;; - --device) DEVICE="$2"; shift 2 ;; + --sam2-checkpoint) SAM2_CHECKPOINT="$2"; shift 2 ;; + --device) DEVICE="$2"; shift 2 ;; + --stage3-segmentation-model) STAGE3_SEGMENTATION_MODEL="$2"; shift 2 ;; *) echo "Unknown argument: $1"; exit 1 ;; esac done @@ -36,9 +45,10 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" echo "==========================================" echo "Void Mask Generation Pipeline" echo "==========================================" -echo "Config: $CONFIG" -echo "SAM2 checkpoint: $SAM2_CHECKPOINT" -echo "Device: $DEVICE" +echo "Config: $CONFIG" +echo "SAM2 checkpoint: $SAM2_CHECKPOINT" +echo "Device: $DEVICE" +echo "Stage 3 segmentation: $STAGE3_SEGMENTATION_MODEL" echo "==========================================" # ── Stage 1: SAM2 Segmentation ───────────────────────────────────────────────── @@ -59,7 +69,8 @@ python "$SCRIPT_DIR/stage2_vlm_analysis.py" \ echo "" echo "[3/4] Generating grey masks..." 
python "$SCRIPT_DIR/stage3a_generate_grey_masks_v2.py" \ - --config "$CONFIG" + --config "$CONFIG" \ + --segmentation-model "$STAGE3_SEGMENTATION_MODEL" # ── Stage 4: Combine into Quadmask ──────────────────────────────────────────── echo "" From 78caac4d0d8e3090cfbce24a66cec829d6a24dd3 Mon Sep 17 00:00:00 2001 From: Tharen Candi Date: Wed, 15 Apr 2026 18:00:09 +0200 Subject: [PATCH 2/3] fixed documentation for sam3 and langSAM --- VLM-MASK-REASONER/README.md | 32 +++++++++++++------ .../stage3a_generate_grey_masks_v2.py | 7 ---- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/VLM-MASK-REASONER/README.md b/VLM-MASK-REASONER/README.md index 169cadb..3a6cb3b 100644 --- a/VLM-MASK-REASONER/README.md +++ b/VLM-MASK-REASONER/README.md @@ -102,25 +102,39 @@ wget https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.p Stage 3a uses text-prompted segmentation to identify affected objects. You need **either** SAM3 or LangSAM: -**Option A: SAM3** +**Option A: SAM3 (default, recommended)** -```bash -pip install git+https://github.com/facebookresearch/segment-anything-3.git -``` +SAM3 requires Python 3.12+, PyTorch 2.7+, and HuggingFace authentication: -Download the SAM3 checkpoint: ```bash -# Check SAM3 repo for latest checkpoint URLs -wget https://dl.fbaipublicfiles.com/segment_anything_3/sam3_hiera_large.pt +# Install SAM3 +git clone https://github.com/facebookresearch/sam3.git +cd sam3 +pip install -e . +cd .. ``` +**Checkpoint access:** SAM3 checkpoints are hosted on HuggingFace and require authentication: + +1. Request access at https://huggingface.co/facebook/sam3.1 +2. Once approved, authenticate: + ```bash + pip install -U "huggingface_hub[cli]" + huggingface-cli login # Enter your HF token + ``` +3. The model will auto-download checkpoints on first use + +See the [SAM3 repo](https://github.com/facebookresearch/sam3) for full installation details. 
+ **Option B: LangSAM (alternative)** +LangSAM combines SAM 2.1 with GroundingDINO for text-prompted segmentation. Requires Python 3.10+: + ```bash -pip install lang-sam +pip install -U git+https://github.com/luca-medeiros/lang-segment-anything.git ``` -To use LangSAM instead of SAM3, pass `--stage3-segmentation-model langsam` to `run_pipeline.sh`. +LangSAM auto-downloads its checkpoints (GroundingDINO + SAM 2.1) and doesn't require authentication. To use it, pass `--stage3-segmentation-model langsam` to `run_pipeline.sh`. If you place the checkpoint elsewhere, pass it explicitly: diff --git a/VLM-MASK-REASONER/stage3a_generate_grey_masks_v2.py b/VLM-MASK-REASONER/stage3a_generate_grey_masks_v2.py index c41a847..e710087 100644 --- a/VLM-MASK-REASONER/stage3a_generate_grey_masks_v2.py +++ b/VLM-MASK-REASONER/stage3a_generate_grey_masks_v2.py @@ -45,13 +45,6 @@ from PIL import Image import subprocess -# SAM2 for video tracking -try: - from sam2.build_sam import build_sam2_video_predictor - SAM2_AVAILABLE = True -except ImportError: - SAM2_AVAILABLE = False - # SAM3 for single-frame segmentation try: from sam3.model_builder import build_sam3_image_model From d232e839f1e8820532a8c7733680345f28801dcb Mon Sep 17 00:00:00 2001 From: Tharen Candi Date: Wed, 15 Apr 2026 19:03:02 +0200 Subject: [PATCH 3/3] notebook to demo the VLM-MASK-REASONER --- .../notebook-vlm-mask-reasoner.ipynb | 561 ++++++++++++++++++ 1 file changed, 561 insertions(+) create mode 100644 VLM-MASK-REASONER/notebook-vlm-mask-reasoner.ipynb diff --git a/VLM-MASK-REASONER/notebook-vlm-mask-reasoner.ipynb b/VLM-MASK-REASONER/notebook-vlm-mask-reasoner.ipynb new file mode 100644 index 0000000..f2fd2c4 --- /dev/null +++ b/VLM-MASK-REASONER/notebook-vlm-mask-reasoner.ipynb @@ -0,0 +1,561 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\n", + "# VLM-MASK-REASONER for VOID: Quadmask Generation Pipeline\n", + "\n", + "**Automatic interaction-aware mask generation using SAM2, Gemini VLM, and LangSAM**\n", + "\n", + "[Project Page](https://void-model.github.io/) | [Paper](https://arxiv.org/abs/2604.02296) | [HuggingFace](https://huggingface.co/netflix/void-model)\n", + "\n", + "
\n", + "\n", + "This notebook demonstrates the **VLM-MASK-REASONER** pipeline that generates quadmasks for VOID video inpainting. The pipeline:\n", + "\n", + "1. **Stage 1:** Uses SAM2 to segment the primary object (what you want to remove)\n", + "2. **Stage 2:** Uses Gemini VLM to identify affected objects (shadows, reflections, interactions)\n", + "3. **Stage 3:** Uses LangSAM to segment affected objects\n", + "4. **Stage 4:** Combines everything into a quadmask with 4 semantic values (0, 63, 127, 255)\n", + "\n", + "**Requirements:** \n", + "- GPU runtime (T4 or better recommended)\n", + "- Gemini API key (free at https://aistudio.google.com/app/apikey)\n", + "\n", + "**Note:** This notebook uses **LangSAM** for Stage 3 instead of SAM3 for Python 3.10 compatibility on Colab." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Setup & Installation\n", + "\n", + "Install all required dependencies including SAM2, LangSAM, and download checkpoints." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Clone the repo (skip if already cloned)\n", + "!git clone git@github.com:tharencandi/void-model.git 2>/dev/null || echo \"Repo already exists\"\n", + "!git checkout feat/stage3-segmentation-model-flag\n", + "%cd /content/void-model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install main requirements\n", + "!pip install -q opencv-python-headless numpy pillow tqdm google-generativeai\n", + "\n", + "# Install SAM2 for Stage 1\n", + "!pip install -q git+https://github.com/facebookresearch/segment-anything-2.git\n", + "\n", + "# Install LangSAM for Stage 3 (Python 3.10 compatible)\n", + "!pip install -q -U git+https://github.com/luca-medeiros/lang-segment-anything.git\n", + "\n", + "print(\"✓ All dependencies installed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
Download SAM2 Checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "sam2_checkpoint = Path(\"sam2_hiera_large.pt\")\n", + "\n", + "if not sam2_checkpoint.exists():\n", + " print(\"Downloading SAM2 checkpoint (2.4GB)...\")\n", + " !wget -q https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt\n", + " print(\"✓ SAM2 checkpoint downloaded\")\n", + "else:\n", + " print(\"✓ SAM2 checkpoint already exists\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Set Gemini API Key\n", + "\n", + "Get API key at: https://aistudio.google.com/app/apikey" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from google.colab import userdata\n", + "\n", + "try:\n", + " # Get your Gemini API key from Colab secrets\n", + " gemini_api_key = userdata.get('GEMINI_API_KEY')\n", + " os.environ['GEMINI_API_KEY'] = gemini_api_key\n", + " print(\"✓ Gemini API key loaded from Colab secrets.\")\n", + "except userdata.SecretNotFoundError:\n", + " print(\"❌ GEMINI_API_KEY not found in Colab secrets. Please add it to Colab secrets or set it manually.\")\n", + "except Exception as e:\n", + " print(f\"An error occurred: {e}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Prepare Sample Video & Config\n", + "\n", + "We'll use the \"lime\" sample - a glass being removed, causing the lime to fall.\n", + "\n", + "The pipeline requires a `config_points.json` file with point coordinates. 
You have **two options**:\n", + "\n", + "**Option A:** Use the interactive Point Selector GUI locally (recommended for precision) \n", + "**Option B:** Manually create the config (good for quick testing)\n", + "\n", + "---\n", + "\n", + "### Understanding config_points.json Format\n", + "\n", + "The config file specifies:\n", + "- **video_path**: Path to your input video\n", + "- **output_dir**: Where to save generated masks\n", + "- **instruction**: Text description of what to remove\n", + "- **primary_points**: Click coordinates on the object to segment (for SAM2)\n", + " - `frame_idx`: Which frame to click on (0 = first frame)\n", + " - `points`: List of [x, y] coordinates (click on the object)\n", + " - `labels`: 1 for positive click (include), 0 for negative (exclude)\n", + "- **min_grid**: Grid size for gridification (8 is standard)\n", + "\n", + "**Example config_points.json:**\n", + "```json\n", + "{\n", + " \"videos\": [\n", + " {\n", + " \"video_path\": \"/path/to/video.mp4\",\n", + " \"output_dir\": \"/path/to/output\",\n", + " \"instruction\": \"remove the glass\",\n", + " \"primary_points\": [\n", + " {\n", + " \"frame_idx\": 0,\n", + " \"points\": [[336, 240]],\n", + " \"labels\": [1]\n", + " }\n", + " ],\n", + " \"min_grid\": 8\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "You can add multiple videos to the `videos` array, and add points on multiple frames if the object moves significantly." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Option A: Use Point Selector GUI Locally (Recommended)\n", + "\n", + "For precise point selection, you can run the interactive GUI tool locally on your machine:\n", + "\n", + "**Step 1:** Create a simple config file (without points):\n", + "```json\n", + "{\n", + " \"videos\": [\n", + " {\n", + " \"video_path\": \"/path/to/your/video.mp4\",\n", + " \"output_dir\": \"/path/to/output\",\n", + " \"instruction\": \"remove the glass\"\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "**Step 2:** Run the point selector GUI:\n", + "```bash\n", + "python VLM-MASK-REASONER/point_selector_gui.py --config my_config.json\n", + "```\n", + "\n", + "**Step 3:** Use the GUI to:\n", + "- Navigate frames with the slider or arrow keys\n", + "- Click on the object you want to remove (multiple clicks for better segmentation)\n", + "- Add points on multiple frames if the object moves significantly\n", + "- Press \"Save All Points\" or Space to save\n", + "\n", + "This creates `my_config_points.json` with exact coordinates.\n", + "\n", + "**Step 4:** Upload the `_points.json` file to Colab and use it in the pipeline.\n", + "\n", + "---\n", + "\n", + "### Option B: Manual Config Creation (Quick Start)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Demo: Using Manual Config Creation (Option B)\n", + "\n", + "For this demo, we'll manually create the config. The code below creates a working config for the lime sample with pre-selected coordinates." 
+ ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
 "import json\n",
 "from pathlib import Path\n",
 "\n",
 "# Create config for the lime sample video with actual coordinates\n",
 "# To use your own video: replace video_path, adjust points coordinates\n",
 "config_data = {\n",
 " \"videos\": [\n",
 " {\n",
 " \"video_path\": \"sample/lime/input_video.mp4\", # Path to video\n",
 " \"output_dir\": \"sample/lime/masks_output\", # Output directory\n",
 " \"instruction\": \"remove the glass\", # What to remove\n",
 " \"primary_points_by_frame\": {\n",
 " \"0\": [\n",
 " [2150, 1243],\n",
 " [2702, 1276],\n",
 " [2452, 1886],\n",
 " [2448, 1507],\n",
 " [2424, 1104]\n",
 " ]\n",
 " }, # Frame 0: click on glass at [x, y]\n",
 " \"primary_frames\": [\n",
 " 0\n",
 " ], # Frames with points\n",
 " \"first_appears_frame\": 0, # First frame where object appears\n",
 " \"primary_points\": [\n",
 " [2150, 1243],\n",
 " [2702, 1276],\n",
 " [2452, 1886],\n",
 " [2448, 1507],\n",
 " [2424, 1104]\n",
 " ], # Flattened points (for backward compatibility)\n",
 " \"min_grid\": 8 # Grid size for VLM analysis\n",
 " }\n",
 " ]\n",
 "}\n",
 "\n",
 "# Save config file\n",
 "config_path = Path(\"config_points.json\")\n",
 "with open(config_path, 'w') as f:\n",
 " json.dump(config_data, f, indent=2)\n",
 "\n",
 "output_dir = Path(config_data[\"videos\"][0][\"output_dir\"])\n",
 "output_dir.mkdir(parents=True, exist_ok=True)\n",
 "\n",
 "print(f\"✓ Config created: {config_path}\")\n",
 "print(f\"✓ Output directory: {output_dir}\")\n",
 "print(f\"✓ Using point coordinates: {config_data['videos'][0]['primary_points']}\")\n",
 "\n",
 "print(\"\\n📝 To use your own video:\")\n",
 "print(\" 1. Upload your video file to Colab\")\n",
 "print(\" 2. Edit config_data above - update video_path and output_dir\")\n",
 "print(\" 3. Update coordinates in 'primary_points_by_frame'\")\n",
 "print(\" 4. Add more points: '0': [[x1,y1], [x2,y2], ...]\")\n",
 "print(\" 5. For multi-frame: '0': [[x1,y1]], '30': [[x2,y2]]\")\n",
 "print(\"\\n💡 For precise point selection, use the GUI tool from Option A!\")"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
 "## 5. Run the Pipeline\n",
 "\n",
 "You have **two options** to run the 4-stage pipeline:\n",
 "\n",
 "**Option A:** Run all stages at once with `run_pipeline.sh` (recommended - single command) \n",
 "**Option B:** Run each stage individually (good for debugging or customization)\n",
 "\n",
 "---\n",
 "\n",
 "### Option A: Run All Stages at Once (Recommended)"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
 "# Run the complete pipeline with a single command\n",
 "# This runs all 4 stages automatically: SAM2 → Gemini VLM → LangSAM → Combine\n",
 "\n",
 "!bash VLM-MASK-REASONER/run_pipeline.sh config_points.json \\\n",
 " --sam2-checkpoint sam2_hiera_large.pt \\\n",
 " --stage3-segmentation-model langsam \\\n",
 " --device cuda\n",
 "\n",
 "print(\"\\n\" + \"=\" * 70)\n",
 "print(\"✓ Complete Pipeline Finished!\")\n",
 "print(\"=\" * 70)\n",
 "print(\"\\nGenerated files in output directory:\")\n",
 "print(\" - black_mask.mp4 (Stage 1)\")\n",
 "print(\" - vlm_analysis.json (Stage 2)\")\n",
 "print(\" - grey_mask.mp4 (Stage 3)\")\n",
 "print(\" - quadmask_0.mp4 (Stage 4)\")\n",
 "print(\"\\nContinue to Section 6 to visualize results.\")"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
 "---\n",
 "\n",
 "### Option B: Run Stages Individually\n",
 "\n",
 "Run each stage separately if you want to inspect intermediate results or debug issues.\n",
 "\n",
 "**Stage 1:** SAM2 Segmentation (primary object) \n",
 "**Stage 2:** Gemini VLM Analysis (identify affected objects) \n",
"**Stage 3:** LangSAM Segmentation (affected objects) \n", + "**Stage 4:** Combine into Quadmask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run Stage 1: SAM2 Segmentation\n", + "print(\"=\" * 70)\n", + "print(\"Stage 1: SAM2 Segmentation\")\n", + "print(\"=\" * 70)\n", + "\n", + "!python VLM-MASK-REASONER/stage1_sam2_segmentation.py \\\n", + " --config config_points.json \\\n", + " --sam2-checkpoint sam2_hiera_large.pt \\\n", + " --device cuda\n", + "\n", + "print(\"\\n✓ Stage 1 complete: black_mask.mp4 generated\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run Stage 2: VLM Analysis with Gemini\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"Stage 2: VLM Analysis (Gemini)\")\n", + "print(\"=\" * 70)\n", + "\n", + "!python VLM-MASK-REASONER/stage2_vlm_analysis.py \\\n", + " --config config_points.json\n", + "\n", + "print(\"\\n✓ Stage 2 complete: vlm_analysis.json generated\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run Stage 3: Grey Mask Generation with LangSAM\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"Stage 3: Generate Grey Masks (LangSAM)\")\n", + "print(\"=\" * 70)\n", + "\n", + "!python VLM-MASK-REASONER/stage3a_generate_grey_masks_v2.py \\\n", + " --config config_points.json \\\n", + " --segmentation-model langsam\n", + "\n", + "print(\"\\n✓ Stage 3 complete: grey_mask.mp4 generated\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run Stage 4: Combine into Quadmask\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"Stage 4: Combine Masks into Quadmask\")\n", + "print(\"=\" * 70)\n", + "\n", + "!python VLM-MASK-REASONER/stage4_combine_masks.py \\\n", + " --config config_points.json\n", + "\n", + "print(\"\\n✓ Stage 4 complete: quadmask_0.mp4 
generated\")\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"Pipeline Complete!\")\n", + "print(\"=\" * 70)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Visualize Results\n", + "\n", + "Display the generated masks and VLM analysis to verify the pipeline output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import cv2\n", + "import numpy as np\n", + "from IPython.display import Image, display\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Read first frame of each video\n", + "def read_first_frame(video_path):\n", + " cap = cv2.VideoCapture(str(video_path))\n", + " ret, frame = cap.read()\n", + " cap.release()\n", + " if ret:\n", + " return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n", + " return None\n", + "\n", + "# Load frames\n", + "input_frame = read_first_frame(output_dir / \"input_video.mp4\")\n", + "black_mask = read_first_frame(output_dir / \"black_mask.mp4\")\n", + "grey_mask = read_first_frame(output_dir / \"grey_mask.mp4\")\n", + "quadmask = read_first_frame(output_dir / \"quadmask_0.mp4\")\n", + "\n", + "# Create visualization\n", + "fig, axes = plt.subplots(2, 2, figsize=(14, 14))\n", + "\n", + "axes[0, 0].imshow(input_frame)\n", + "axes[0, 0].set_title(\"Input Video (Frame 0)\", fontsize=14, fontweight='bold')\n", + "axes[0, 0].axis('off')\n", + "\n", + "axes[0, 1].imshow(black_mask, cmap='gray')\n", + "axes[0, 1].set_title(\"Black Mask (Primary Object)\\n0=remove, 255=keep\", fontsize=14, fontweight='bold')\n", + "axes[0, 1].axis('off')\n", + "\n", + "axes[1, 0].imshow(grey_mask, cmap='gray')\n", + "axes[1, 0].set_title(\"Grey Mask (Affected Objects)\\n127=affected, 255=background\", fontsize=14, fontweight='bold')\n", + "axes[1, 0].axis('off')\n", + "\n", + "axes[1, 1].imshow(quadmask)\n", + "axes[1, 1].set_title(\"Final Quadmask\\n0=primary, 63=overlap, 127=affected, 255=bg\", fontsize=14, fontweight='bold')\n", + "axes[1, 
1].axis('off')\n", + "\n", + "plt.tight_layout()\n", + "plt.savefig(output_dir / \"pipeline_results.png\", dpi=150, bbox_inches='tight')\n", + "plt.show()\n", + "\n", + "print(\"✓ Visualization saved to:\", output_dir / \"pipeline_results.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show VLM analysis results\n", + "import json\n", + "\n", + "vlm_analysis_path = output_dir / \"vlm_analysis.json\"\n", + "if vlm_analysis_path.exists():\n", + " print(\"=\" * 70)\n", + " print(\"Gemini VLM Analysis Results\")\n", + " print(\"=\" * 70)\n", + " \n", + " with open(vlm_analysis_path, 'r') as f:\n", + " analysis = json.load(f)\n", + " \n", + " print(f\"\\nPrimary Object: {analysis.get('primary_object', 'N/A')}\")\n", + " print(f\"Scene Description: {analysis.get('scene_description', 'N/A')}\")\n", + " \n", + " affected = analysis.get('affected_objects', [])\n", + " print(f\"\\nAffected Objects ({len(affected)}):\")\n", + " for i, obj in enumerate(affected, 1):\n", + " print(f\" {i}. {obj.get('noun', 'N/A')} - {obj.get('reasoning', 'N/A')}\")\n", + " \n", + " print(\"\\n\" + \"=\" * 70)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. 
Download Results\n", + "\n", + "Download the quadmask and use it with VOID Pass 1 inference.\n", + "\n", + "**Quadmask Values:**\n", + "- **0 (black):** Primary object to remove\n", + "- **63:** Overlap region\n", + "- **127 (grey):** Affected objects\n", + "- **255 (white):** Background\n", + "\n", + "**Try other samples:** Change coordinates to `\"moving_ball\"` or `\"pillow\"` sample videos\n", + "\n", + "**Use your own video:** Upload a video, modify the config with correct path and click points, re-run from Section 5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from google.colab import files\n", + "\n", + "# Download the quadmask\n", + "quadmask_path = output_dir / \"quadmask_0.mp4\"\n", + "if quadmask_path.exists():\n", + " print(\"Downloading quadmask_0.mp4...\")\n", + " files.download(str(quadmask_path))\n", + " print(\"✓ Download complete!\")\n", + "else:\n", + " print(\"❌ Quadmask not found. Check pipeline output above for errors.\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}