From 4fa8df2c6a8b28c1c3067c87e4d64236d641cd83 Mon Sep 17 00:00:00 2001 From: Tharen Candi Date: Wed, 15 Apr 2026 17:38:46 +0200 Subject: [PATCH 1/3] feat: add stage3 segmentation model flag and SAM3/LangSAM docs - Add --stage3-segmentation-model flag to run_pipeline.sh (sam3|langsam) - Document SAM3 and LangSAM installation requirements - Clarify Stage 1 uses SAM2, Stage 3a uses SAM3/LangSAM - Update VLM-MASK-REASONER README --- VLM-MASK-REASONER/README.md | 29 ++++++++++++++++++++++++++--- VLM-MASK-REASONER/run_pipeline.sh | 29 ++++++++++++++++++++--------- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/VLM-MASK-REASONER/README.md b/VLM-MASK-REASONER/README.md index 4888f83..169cadb 100644 --- a/VLM-MASK-REASONER/README.md +++ b/VLM-MASK-REASONER/README.md @@ -45,8 +45,7 @@ Optional flags: ```bash bash run_pipeline.sh \ - --sam2-checkpoint ../sam2_hiera_large.pt \ - --device cuda + --sam2-checkpoint ../sam2_hiera_large.pt \ --stage3-segmentation-model langsam \ --device cuda ``` This runs four stages automatically: @@ -84,7 +83,7 @@ Install the main requirements from the repo root: pip install -r requirements.txt ``` -### 2. SAM2 +### 2. SAM2 (Stage 1) SAM2 must be installed separately (it is not on PyPI): @@ -99,6 +98,30 @@ Then download the SAM2 checkpoint. The pipeline defaults to `sam2_hiera_large.pt wget https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt ``` +### 3. SAM3 or LangSAM (Stage 3a) + +Stage 3a uses text-prompted segmentation to identify affected objects. 
You need **either** SAM3 or LangSAM: + +**Option A: SAM3** + +```bash +pip install git+https://github.com/facebookresearch/segment-anything-3.git +``` + +Download the SAM3 checkpoint: +```bash +# Check SAM3 repo for latest checkpoint URLs +wget https://dl.fbaipublicfiles.com/segment_anything_3/sam3_hiera_large.pt +``` + +**Option B: LangSAM (alternative)** + +```bash +pip install lang-sam +``` + +To use LangSAM instead of SAM3, pass `--stage3-segmentation-model langsam` to `run_pipeline.sh`. + If you place the checkpoint elsewhere, pass it explicitly: ```bash diff --git a/VLM-MASK-REASONER/run_pipeline.sh b/VLM-MASK-REASONER/run_pipeline.sh index f1b8d98..f3cef3e 100644 --- a/VLM-MASK-REASONER/run_pipeline.sh +++ b/VLM-MASK-REASONER/run_pipeline.sh @@ -3,30 +3,39 @@ # Runs stages 1-4 given a points config JSON (output of point_selector_gui.py) # # Usage: -# bash run_pipeline.sh [--sam2-checkpoint PATH] [--device cuda] +# bash run_pipeline.sh [OPTIONS] # -# Example: +# Options: +# --sam2-checkpoint PATH SAM2 checkpoint for stage 1 (default: ../sam2_hiera_large.pt) +# --device DEVICE cuda or cpu (default: cuda) +# --stage3-segmentation-model MODEL sam3 or langsam for stage 3a (default: sam3) +# +# Examples: # bash run_pipeline.sh my_config_points.json # bash run_pipeline.sh my_config_points.json --sam2-checkpoint ../sam2_hiera_large.pt +# bash run_pipeline.sh my_config_points.json --stage3-segmentation-model langsam set -e # ── Arguments ────────────────────────────────────────────────────────────────── CONFIG="$1" if [ -z "$CONFIG" ]; then - echo "Usage: bash run_pipeline.sh [--sam2-checkpoint PATH] [--device cuda]" + echo "Usage: bash run_pipeline.sh [OPTIONS]" + echo "See script header for available options." 
exit 1 fi SAM2_CHECKPOINT="../sam2_hiera_large.pt" DEVICE="cuda" +STAGE3_SEGMENTATION_MODEL="sam3" # Parse optional flags shift while [[ $# -gt 0 ]]; do case "$1" in - --sam2-checkpoint) SAM2_CHECKPOINT="$2"; shift 2 ;; - --device) DEVICE="$2"; shift 2 ;; + --sam2-checkpoint) SAM2_CHECKPOINT="$2"; shift 2 ;; + --device) DEVICE="$2"; shift 2 ;; + --stage3-segmentation-model) STAGE3_SEGMENTATION_MODEL="$2"; shift 2 ;; *) echo "Unknown argument: $1"; exit 1 ;; esac done @@ -36,9 +45,10 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" echo "==========================================" echo "Void Mask Generation Pipeline" echo "==========================================" -echo "Config: $CONFIG" -echo "SAM2 checkpoint: $SAM2_CHECKPOINT" -echo "Device: $DEVICE" +echo "Config: $CONFIG" +echo "SAM2 checkpoint: $SAM2_CHECKPOINT" +echo "Device: $DEVICE" +echo "Stage 3 segmentation: $STAGE3_SEGMENTATION_MODEL" echo "==========================================" # ── Stage 1: SAM2 Segmentation ───────────────────────────────────────────────── @@ -59,7 +69,8 @@ python "$SCRIPT_DIR/stage2_vlm_analysis.py" \ echo "" echo "[3/4] Generating grey masks..." 
python "$SCRIPT_DIR/stage3a_generate_grey_masks_v2.py" \ - --config "$CONFIG" + --config "$CONFIG" \ + --segmentation-model "$STAGE3_SEGMENTATION_MODEL" # ── Stage 4: Combine into Quadmask ──────────────────────────────────────────── echo "" From 78caac4d0d8e3090cfbce24a66cec829d6a24dd3 Mon Sep 17 00:00:00 2001 From: Tharen Candi Date: Wed, 15 Apr 2026 18:00:09 +0200 Subject: [PATCH 2/3] fixed documentation for sam3 and langSAM --- VLM-MASK-REASONER/README.md | 32 +++++++++++++------ .../stage3a_generate_grey_masks_v2.py | 7 ---- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/VLM-MASK-REASONER/README.md b/VLM-MASK-REASONER/README.md index 169cadb..3a6cb3b 100644 --- a/VLM-MASK-REASONER/README.md +++ b/VLM-MASK-REASONER/README.md @@ -102,25 +102,39 @@ wget https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.p Stage 3a uses text-prompted segmentation to identify affected objects. You need **either** SAM3 or LangSAM: -**Option A: SAM3** +**Option A: SAM3 (default, recommended)** -```bash -pip install git+https://github.com/facebookresearch/segment-anything-3.git -``` +SAM3 requires Python 3.12+, PyTorch 2.7+, and HuggingFace authentication: -Download the SAM3 checkpoint: ```bash -# Check SAM3 repo for latest checkpoint URLs -wget https://dl.fbaipublicfiles.com/segment_anything_3/sam3_hiera_large.pt +# Install SAM3 +git clone https://github.com/facebookresearch/sam3.git +cd sam3 +pip install -e . +cd .. ``` +**Checkpoint access:** SAM3 checkpoints are hosted on HuggingFace and require authentication: + +1. Request access at https://huggingface.co/facebook/sam3.1 +2. Once approved, authenticate: + ```bash + pip install -U "huggingface_hub[cli]" + huggingface-cli login # Enter your HF token + ``` +3. The model will auto-download checkpoints on first use + +See the [SAM3 repo](https://github.com/facebookresearch/sam3) for full installation details. 
+ **Option B: LangSAM (alternative)** +LangSAM combines SAM 2.1 with GroundingDINO for text-prompted segmentation. Requires Python 3.10+: + ```bash -pip install lang-sam +pip install -U git+https://github.com/luca-medeiros/lang-segment-anything.git ``` -To use LangSAM instead of SAM3, pass `--stage3-segmentation-model langsam` to `run_pipeline.sh`. +LangSAM auto-downloads its checkpoints (GroundingDINO + SAM 2.1) and doesn't require authentication. To use it, pass `--stage3-segmentation-model langsam` to `run_pipeline.sh`. If you place the checkpoint elsewhere, pass it explicitly: diff --git a/VLM-MASK-REASONER/stage3a_generate_grey_masks_v2.py b/VLM-MASK-REASONER/stage3a_generate_grey_masks_v2.py index c41a847..e710087 100644 --- a/VLM-MASK-REASONER/stage3a_generate_grey_masks_v2.py +++ b/VLM-MASK-REASONER/stage3a_generate_grey_masks_v2.py @@ -45,13 +45,6 @@ from PIL import Image import subprocess -# SAM2 for video tracking -try: - from sam2.build_sam import build_sam2_video_predictor - SAM2_AVAILABLE = True -except ImportError: - SAM2_AVAILABLE = False - # SAM3 for single-frame segmentation try: from sam3.model_builder import build_sam3_image_model From d232e839f1e8820532a8c7733680345f28801dcb Mon Sep 17 00:00:00 2001 From: Tharen Candi Date: Wed, 15 Apr 2026 19:03:02 +0200 Subject: [PATCH 3/3] notebook to demo the VLM-MASK-REASONER --- .../notebook-vlm-mask-reasoner.ipynb | 561 ++++++++++++++++++ 1 file changed, 561 insertions(+) create mode 100644 VLM-MASK-REASONER/notebook-vlm-mask-reasoner.ipynb diff --git a/VLM-MASK-REASONER/notebook-vlm-mask-reasoner.ipynb b/VLM-MASK-REASONER/notebook-vlm-mask-reasoner.ipynb new file mode 100644 index 0000000..f2fd2c4 --- /dev/null +++ b/VLM-MASK-REASONER/notebook-vlm-mask-reasoner.ipynb @@ -0,0 +1,561 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\n", + "# VLM-MASK-REASONER for VOID: Quadmask Generation Pipeline\n", + "\n", + "**Automatic interaction-aware mask generation using SAM2, Gemini VLM, and LangSAM**\n", + "\n", + "[Project Page](https://void-model.github.io/) | [Paper](https://arxiv.org/abs/2604.02296) | [HuggingFace](https://huggingface.co/netflix/void-model)\n", + "\n", + "
\n", + "\n", + "This notebook demonstrates the **VLM-MASK-REASONER** pipeline that generates quadmasks for VOID video inpainting. The pipeline:\n", + "\n", + "1. **Stage 1:** Uses SAM2 to segment the primary object (what you want to remove)\n", + "2. **Stage 2:** Uses Gemini VLM to identify affected objects (shadows, reflections, interactions)\n", + "3. **Stage 3:** Uses LangSAM to segment affected objects\n", + "4. **Stage 4:** Combines everything into a quadmask with 4 semantic values (0, 63, 127, 255)\n", + "\n", + "**Requirements:** \n", + "- GPU runtime (T4 or better recommended)\n", + "- Gemini API key (free at https://aistudio.google.com/app/apikey)\n", + "\n", + "**Note:** This notebook uses **LangSAM** for Stage 3 instead of SAM3 for Python 3.10 compatibility on Colab." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Setup & Installation\n", + "\n", + "Install all required dependencies including SAM2, LangSAM, and download checkpoints." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Clone the repo (skip if already cloned)\n", + "!git clone git@github.com:tharencandi/void-model.git 2>/dev/null || echo \"Repo already exists\"\n", + "!git checkout feat/stage3-segmentation-model-flag\n", + "%cd /content/void-model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install main requirements\n", + "!pip install -q opencv-python-headless numpy pillow tqdm google-generativeai\n", + "\n", + "# Install SAM2 for Stage 1\n", + "!pip install -q git+https://github.com/facebookresearch/segment-anything-2.git\n", + "\n", + "# Install LangSAM for Stage 3 (Python 3.10 compatible)\n", + "!pip install -q -U git+https://github.com/luca-medeiros/lang-segment-anything.git\n", + "\n", + "print(\"✓ All dependencies installed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
Download SAM2 Checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "sam2_checkpoint = Path(\"sam2_hiera_large.pt\")\n", + "\n", + "if not sam2_checkpoint.exists():\n", + " print(\"Downloading SAM2 checkpoint (2.4GB)...\")\n", + " !wget -q https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt\n", + " print(\"✓ SAM2 checkpoint downloaded\")\n", + "else:\n", + " print(\"✓ SAM2 checkpoint already exists\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Set Gemini API Key\n", + "\n", + "Get API key at: https://aistudio.google.com/app/apikey" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from google.colab import userdata\n", + "\n", + "try:\n", + " # Get your Gemini API key from Colab secrets\n", + " gemini_api_key = userdata.get('GEMINI_API_KEY')\n", + " os.environ['GEMINI_API_KEY'] = gemini_api_key\n", + " print(\"✓ Gemini API key loaded from Colab secrets.\")\n", + "except userdata.SecretNotFoundError:\n", + " print(\"❌ GEMINI_API_KEY not found in Colab secrets. Please add it to Colab secrets or set it manually.\")\n", + "except Exception as e:\n", + " print(f\"An error occurred: {e}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Prepare Sample Video & Config\n", + "\n", + "We'll use the \"lime\" sample - a glass being removed, causing the lime to fall.\n", + "\n", + "The pipeline requires a `config_points.json` file with point coordinates. 
You have **two options**:\n", + "\n", + "**Option A:** Use the interactive Point Selector GUI locally (recommended for precision) \n", + "**Option B:** Manually create the config (good for quick testing)\n", + "\n", + "---\n", + "\n", + "### Understanding config_points.json Format\n", + "\n", + "The config file specifies:\n", + "- **video_path**: Path to your input video\n", + "- **output_dir**: Where to save generated masks\n", + "- **instruction**: Text description of what to remove\n", + "- **primary_points**: Click coordinates on the object to segment (for SAM2)\n", + " - `frame_idx`: Which frame to click on (0 = first frame)\n", + " - `points`: List of [x, y] coordinates (click on the object)\n", + " - `labels`: 1 for positive click (include), 0 for negative (exclude)\n", + "- **min_grid**: Grid size for gridification (8 is standard)\n", + "\n", + "**Example config_points.json:**\n", + "```json\n", + "{\n", + " \"videos\": [\n", + " {\n", + " \"video_path\": \"/path/to/video.mp4\",\n", + " \"output_dir\": \"/path/to/output\",\n", + " \"instruction\": \"remove the glass\",\n", + " \"primary_points\": [\n", + " {\n", + " \"frame_idx\": 0,\n", + " \"points\": [[336, 240]],\n", + " \"labels\": [1]\n", + " }\n", + " ],\n", + " \"min_grid\": 8\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "You can add multiple videos to the `videos` array, and add points on multiple frames if the object moves significantly." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Option A: Use Point Selector GUI Locally (Recommended)\n", + "\n", + "For precise point selection, you can run the interactive GUI tool locally on your machine:\n", + "\n", + "**Step 1:** Create a simple config file (without points):\n", + "```json\n", + "{\n", + " \"videos\": [\n", + " {\n", + " \"video_path\": \"/path/to/your/video.mp4\",\n", + " \"output_dir\": \"/path/to/output\",\n", + " \"instruction\": \"remove the glass\"\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "**Step 2:** Run the point selector GUI:\n", + "```bash\n", + "python VLM-MASK-REASONER/point_selector_gui.py --config my_config.json\n", + "```\n", + "\n", + "**Step 3:** Use the GUI to:\n", + "- Navigate frames with the slider or arrow keys\n", + "- Click on the object you want to remove (multiple clicks for better segmentation)\n", + "- Add points on multiple frames if the object moves significantly\n", + "- Press \"Save All Points\" or Space to save\n", + "\n", + "This creates `my_config_points.json` with exact coordinates.\n", + "\n", + "**Step 4:** Upload the `_points.json` file to Colab and use it in the pipeline.\n", + "\n", + "---\n", + "\n", + "### Option B: Manual Config Creation (Quick Start)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Demo: Using Manual Config Creation (Option B)\n", + "\n", + "For this demo, we'll manually create the config. The code below creates a working config for the lime sample with pre-selected coordinates." 
+ ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
 "import json\n",
 "from pathlib import Path\n",
 "\n",
 "# Create config for the lime sample video with actual coordinates\n",
 "# To use your own video: replace video_path, adjust points coordinates\n",
 "config_data = {\n",
 " \"videos\": [\n",
 " {\n",
 " \"video_path\": \"sample/lime/input_video.mp4\", # Path to video\n",
 " \"output_dir\": \"sample/lime/masks_output\", # Output directory\n",
 " \"instruction\": \"remove the glass\", # What to remove\n",
 " \"primary_points_by_frame\": {\n",
 " \"0\": [\n",
 " [2150, 1243],\n",
 " [2702, 1276],\n",
 " [2452, 1886],\n",
 " [2448, 1507],\n",
 " [2424, 1104]\n",
 " ]\n",
 " }, # Frame 0: click on glass at [x, y]\n",
 " \"primary_frames\": [\n",
 " 0\n",
 " ], # Frames with points\n",
 " \"first_appears_frame\": 0, # First frame where object appears\n",
 " \"primary_points\": [\n",
 " [2150, 1243],\n",
 " [2702, 1276],\n",
 " [2452, 1886],\n",
 " [2448, 1507],\n",
 " [2424, 1104]\n",
 " ], # Flattened points (for backward compatibility)\n",
 " \"min_grid\": 8 # Grid size for VLM analysis\n",
 " }\n",
 " ]\n",
 "}\n",
 "\n",
 "# Save config file\n",
 "config_path = Path(\"config_points.json\")\n",
 "with open(config_path, 'w') as f:\n",
 " json.dump(config_data, f, indent=2)\n",
 "\n",
 "output_dir = Path(config_data[\"videos\"][0][\"output_dir\"])\n",
 "output_dir.mkdir(parents=True, exist_ok=True)\n",
 "\n",
 "print(f\"✓ Config created: {config_path}\")\n",
 "print(f\"✓ Output directory: {output_dir}\")\n",
 "print(f\"✓ Using point coordinates: {config_data['videos'][0]['primary_points']}\")\n",
 "\n",
 "print(\"\\n📝 To use your own video:\")\n",
 "print(\" 1. Upload your video file to Colab\")\n",
 "print(\" 2. Edit config_data above - update video_path and output_dir\")\n",
 "print(\" 3. Update coordinates in 'primary_points_by_frame'\")\n",
 "print(\" 4. Add more points: '0': [[x1,y1], [x2,y2], ...]\")\n",
 "print(\" 5. For multi-frame: '0': [[x1,y1]], '30': [[x2,y2]]\")\n",
 "print(\"\\n💡 For precise point selection, use the GUI tool from Option A!\")"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
 "## 5. Run the Pipeline\n",
 "\n",
 "You have **two options** to run the 4-stage pipeline:\n",
 "\n",
 "**Option A:** Run all stages at once with `run_pipeline.sh` (recommended - single command) \n",
 "**Option B:** Run each stage individually (good for debugging or customization)\n",
 "\n",
 "---\n",
 "\n",
 "### Option A: Run All Stages at Once (Recommended)"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
 "# Run the complete pipeline with a single command\n",
 "# This runs all 4 stages automatically: SAM2 → Gemini VLM → LangSAM → Combine\n",
 "\n",
 "!bash VLM-MASK-REASONER/run_pipeline.sh config_points.json \\\n",
 " --sam2-checkpoint sam2_hiera_large.pt \\\n",
 " --stage3-segmentation-model langsam \\\n",
 " --device cuda\n",
 "\n",
 "print(\"\\n\" + \"=\" * 70)\n",
 "print(\"✓ Complete Pipeline Finished!\")\n",
 "print(\"=\" * 70)\n",
 "print(\"\\nGenerated files in output directory:\")\n",
 "print(\" - black_mask.mp4 (Stage 1)\")\n",
 "print(\" - vlm_analysis.json (Stage 2)\")\n",
 "print(\" - grey_mask.mp4 (Stage 3)\")\n",
 "print(\" - quadmask_0.mp4 (Stage 4)\")\n",
 "print(\"\\nContinue to Section 6 to visualize results.\")"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
 "---\n",
 "\n",
 "### Option B: Run Stages Individually\n",
 "\n",
 "Run each stage separately if you want to inspect intermediate results or debug issues.\n",
 "\n",
 "**Stage 1:** SAM2 Segmentation (primary object) \n",
 "**Stage 2:** Gemini VLM Analysis (identify affected objects) \n",
"**Stage 3:** LangSAM Segmentation (affected objects) \n", + "**Stage 4:** Combine into Quadmask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run Stage 1: SAM2 Segmentation\n", + "print(\"=\" * 70)\n", + "print(\"Stage 1: SAM2 Segmentation\")\n", + "print(\"=\" * 70)\n", + "\n", + "!python VLM-MASK-REASONER/stage1_sam2_segmentation.py \\\n", + " --config config_points.json \\\n", + " --sam2-checkpoint sam2_hiera_large.pt \\\n", + " --device cuda\n", + "\n", + "print(\"\\n✓ Stage 1 complete: black_mask.mp4 generated\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run Stage 2: VLM Analysis with Gemini\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"Stage 2: VLM Analysis (Gemini)\")\n", + "print(\"=\" * 70)\n", + "\n", + "!python VLM-MASK-REASONER/stage2_vlm_analysis.py \\\n", + " --config config_points.json\n", + "\n", + "print(\"\\n✓ Stage 2 complete: vlm_analysis.json generated\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run Stage 3: Grey Mask Generation with LangSAM\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"Stage 3: Generate Grey Masks (LangSAM)\")\n", + "print(\"=\" * 70)\n", + "\n", + "!python VLM-MASK-REASONER/stage3a_generate_grey_masks_v2.py \\\n", + " --config config_points.json \\\n", + " --segmentation-model langsam\n", + "\n", + "print(\"\\n✓ Stage 3 complete: grey_mask.mp4 generated\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run Stage 4: Combine into Quadmask\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"Stage 4: Combine Masks into Quadmask\")\n", + "print(\"=\" * 70)\n", + "\n", + "!python VLM-MASK-REASONER/stage4_combine_masks.py \\\n", + " --config config_points.json\n", + "\n", + "print(\"\\n✓ Stage 4 complete: quadmask_0.mp4 
generated\")\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"Pipeline Complete!\")\n", + "print(\"=\" * 70)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Visualize Results\n", + "\n", + "Display the generated masks and VLM analysis to verify the pipeline output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import cv2\n", + "import numpy as np\n", + "from IPython.display import Image, display\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Read first frame of each video\n", + "def read_first_frame(video_path):\n", + " cap = cv2.VideoCapture(str(video_path))\n", + " ret, frame = cap.read()\n", + " cap.release()\n", + " if ret:\n", + " return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n", + " return None\n", + "\n", + "# Load frames\n", + "input_frame = read_first_frame(output_dir / \"input_video.mp4\")\n", + "black_mask = read_first_frame(output_dir / \"black_mask.mp4\")\n", + "grey_mask = read_first_frame(output_dir / \"grey_mask.mp4\")\n", + "quadmask = read_first_frame(output_dir / \"quadmask_0.mp4\")\n", + "\n", + "# Create visualization\n", + "fig, axes = plt.subplots(2, 2, figsize=(14, 14))\n", + "\n", + "axes[0, 0].imshow(input_frame)\n", + "axes[0, 0].set_title(\"Input Video (Frame 0)\", fontsize=14, fontweight='bold')\n", + "axes[0, 0].axis('off')\n", + "\n", + "axes[0, 1].imshow(black_mask, cmap='gray')\n", + "axes[0, 1].set_title(\"Black Mask (Primary Object)\\n0=remove, 255=keep\", fontsize=14, fontweight='bold')\n", + "axes[0, 1].axis('off')\n", + "\n", + "axes[1, 0].imshow(grey_mask, cmap='gray')\n", + "axes[1, 0].set_title(\"Grey Mask (Affected Objects)\\n127=affected, 255=background\", fontsize=14, fontweight='bold')\n", + "axes[1, 0].axis('off')\n", + "\n", + "axes[1, 1].imshow(quadmask)\n", + "axes[1, 1].set_title(\"Final Quadmask\\n0=primary, 63=overlap, 127=affected, 255=bg\", fontsize=14, fontweight='bold')\n", + "axes[1, 
1].axis('off')\n", + "\n", + "plt.tight_layout()\n", + "plt.savefig(output_dir / \"pipeline_results.png\", dpi=150, bbox_inches='tight')\n", + "plt.show()\n", + "\n", + "print(\"✓ Visualization saved to:\", output_dir / \"pipeline_results.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show VLM analysis results\n", + "import json\n", + "\n", + "vlm_analysis_path = output_dir / \"vlm_analysis.json\"\n", + "if vlm_analysis_path.exists():\n", + " print(\"=\" * 70)\n", + " print(\"Gemini VLM Analysis Results\")\n", + " print(\"=\" * 70)\n", + " \n", + " with open(vlm_analysis_path, 'r') as f:\n", + " analysis = json.load(f)\n", + " \n", + " print(f\"\\nPrimary Object: {analysis.get('primary_object', 'N/A')}\")\n", + " print(f\"Scene Description: {analysis.get('scene_description', 'N/A')}\")\n", + " \n", + " affected = analysis.get('affected_objects', [])\n", + " print(f\"\\nAffected Objects ({len(affected)}):\")\n", + " for i, obj in enumerate(affected, 1):\n", + " print(f\" {i}. {obj.get('noun', 'N/A')} - {obj.get('reasoning', 'N/A')}\")\n", + " \n", + " print(\"\\n\" + \"=\" * 70)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. 
Download Results\n", + "\n", + "Download the quadmask and use it with VOID Pass 1 inference.\n", + "\n", + "**Quadmask Values:**\n", + "- **0 (black):** Primary object to remove\n", + "- **63:** Overlap region\n", + "- **127 (grey):** Affected objects\n", + "- **255 (white):** Background\n", + "\n", + "**Try other samples:** Change coordinates to `\"moving_ball\"` or `\"pillow\"` sample videos\n", + "\n", + "**Use your own video:** Upload a video, modify the config with correct path and click points, re-run from Section 5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from google.colab import files\n", + "\n", + "# Download the quadmask\n", + "quadmask_path = output_dir / \"quadmask_0.mp4\"\n", + "if quadmask_path.exists():\n", + " print(\"Downloading quadmask_0.mp4...\")\n", + " files.download(str(quadmask_path))\n", + " print(\"✓ Download complete!\")\n", + "else:\n", + " print(\"❌ Quadmask not found. Check pipeline output above for errors.\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}