From de2bfcfd9b04f349b8ededd58ed2ff7681b3a039 Mon Sep 17 00:00:00 2001
From: antdev <237216263+yumesha@users.noreply.github.com>
Date: Mon, 16 Feb 2026 20:38:46 +0800
Subject: [PATCH] feat(ui): add video clipboard paste support for shell and
 print mode

- Add ClipboardVideo class and grab_video_from_clipboard() in clipboard.py

- Add video support to AttachmentCache with store_video_reference()

- Add _try_paste_video() method in prompt.py for Ctrl-V handling

- Add _extract_video_paths() and _build_content_parts() in print/__init__.py

- Export VIDEO_EXTENSIONS constant from file utils

- Update documentation for video clipboard paste

- Add tests for video attachment handling
---
 docs/en/guides/interaction.md     |  22 ++++-
 docs/zh/guides/interaction.md     |  22 ++++-
 src/kimi_cli/tools/file/utils.py  |   3 +
 src/kimi_cli/ui/print/__init__.py |  94 +++++++++++++++++++-
 src/kimi_cli/ui/shell/prompt.py   |  87 ++++++++++++++++++-
 src/kimi_cli/utils/clipboard.py   |  74 ++++++++++++++++
 tests/test_attachment_cache.py    |  59 +++++++++++++
 tests/test_print_video.py         | 139 ++++++++++++++++++++++++++++++
 8 files changed, 489 insertions(+), 11 deletions(-)
 create mode 100644 tests/test_print_video.py

diff --git a/docs/en/guides/interaction.md b/docs/en/guides/interaction.md
index 432af4390..ff325ae76 100644
--- a/docs/en/guides/interaction.md
+++ b/docs/en/guides/interaction.md
@@ -45,14 +45,30 @@ Sometimes you need to enter multiple lines, such as pasting a code snippet or er
 
 After finishing your input, press `Enter` to send the complete message.
 
-## Clipboard and image paste
+## Clipboard and media paste
 
-Press `Ctrl-V` to paste text or images from the clipboard.
+Press `Ctrl-V` to paste text, images, or video files from the clipboard.
 
-If the clipboard contains an image, Kimi Code CLI will automatically add the image as an attachment to the message. After sending the message, the AI can see and analyze the image.
+If the clipboard contains an **image**, Kimi Code CLI will automatically add the image as an attachment to the message. After sending the message, the AI can see and analyze the image.
+
+If the clipboard contains a **video file** (a copied file, or text that is the path to one), Kimi Code CLI will insert a reference to the video file. The AI can then use the `ReadMediaFile` tool to read and analyze the video content.
+
+Supported video formats include: MP4, MKV, AVI, MOV, WMV, WebM, M4V, FLV, 3GP, and 3G2.
+
+### Video input in print mode (non-interactive)
+
+When using [print mode](../customization/print-mode.md) with `-c` or `--command`, you can reference video files directly:
+
+```sh
+kimi --print -c "Analyze this video /path/to/video.mp4"
+```
+
+Kimi Code CLI will automatically detect video file paths in your command and make them available to the AI for analysis.
 
 ::: tip
 Image input requires the model to support the `image_in` capability. Video input requires the `video_in` capability.
+
+Models like [Kimi K2.5](https://huggingface.co/moonshotai/Kimi-K2.5) support video understanding with strong performance on benchmarks like VideoMMMU (86.6), VideoMME (87.4), and LongVideoBench (79.8).
 :::
 
 ## Slash commands
diff --git a/docs/zh/guides/interaction.md b/docs/zh/guides/interaction.md
index af1e8f007..6e50cae45 100644
--- a/docs/zh/guides/interaction.md
+++ b/docs/zh/guides/interaction.md
@@ -45,14 +45,30 @@ Thinking 模式需要当前模型支持。部分模型（如 `kimi-k2-thinking-t
 
 输入完成后，按 `Enter` 发送整条消息。
 
-## 剪贴板与图片粘贴
+## 剪贴板与媒体粘贴
 
-按 `Ctrl-V` 可以粘贴剪贴板中的文本或图片。
+按 `Ctrl-V` 可以粘贴剪贴板中的文本、图片或视频文件。
 
-如果剪贴板中是图片，Kimi Code CLI 会自动将图片作为附件添加到消息中。发送消息后，AI 可以看到并分析这张图片。
+如果剪贴板中是**图片**，Kimi Code CLI 会自动将图片作为附件添加到消息中。发送消息后，AI 可以看到并分析这张图片。
+
+如果剪贴板中是**视频文件**（复制的文件，或内容为视频文件路径的文本），Kimi Code CLI 会插入该视频文件的引用。AI 随后可以使用 `ReadMediaFile` 工具读取和分析视频内容。
+
+支持的视频格式包括：MP4、MKV、AVI、MOV、WMV、WebM、M4V、FLV、3GP 和 3G2。
+
+### Print 模式（非交互式）中的视频输入
+
+在使用 [Print 模式](../customization/print-mode.md) 配合 `-c` 或 `--command` 时，你可以直接引用视频文件：
+
+```sh
+kimi --print -c "分析这个视频 /path/to/video.mp4"
+```
+
+Kimi Code CLI 会自动检测命令中的视频文件路径，并让 AI 进行分析。
 
 ::: tip 提示
 图片输入需要当前模型支持 `image_in` 能力，视频输入需要支持 `video_in` 能力。
+
+像 [Kimi K2.5](https://huggingface.co/moonshotai/Kimi-K2.5) 这样的模型支持视频理解，在 VideoMMMU (86.6)、VideoMME (87.4) 和 LongVideoBench (79.8) 等基准测试中表现优异。
 :::
 
 ## 斜杠命令
diff --git a/src/kimi_cli/tools/file/utils.py b/src/kimi_cli/tools/file/utils.py
index d674f8989..2f58e6e04 100644
--- a/src/kimi_cli/tools/file/utils.py
+++ b/src/kimi_cli/tools/file/utils.py
@@ -52,6 +52,9 @@
     ".3gp": "video/3gpp",
     ".3g2": "video/3gpp2",
 }
+
+# Public alias of the suffix -> MIME type mapping for supported video files.
+VIDEO_EXTENSIONS = _VIDEO_MIME_BY_SUFFIX
 _TEXT_MIME_BY_SUFFIX = {
     ".svg": "image/svg+xml",
 }
diff --git a/src/kimi_cli/ui/print/__init__.py b/src/kimi_cli/ui/print/__init__.py
index b48466bfc..501740b87 100644
--- a/src/kimi_cli/ui/print/__init__.py
+++ b/src/kimi_cli/ui/print/__init__.py
@@ -2,12 +2,13 @@
 
 import asyncio
 import json
+import re
 import sys
 from functools import partial
 from pathlib import Path
 
 from kosong.chat_provider import ChatProviderError
-from kosong.message import Message
+from kosong.message import ContentPart, Message, TextPart
 from rich import print
 
 from kimi_cli.cli import InputFormat, OutputFormat
@@ -20,10 +21,95 @@
     run_soul,
 )
 from kimi_cli.soul.kimisoul import KimiSoul
+from kimi_cli.tools.file.utils import VIDEO_EXTENSIONS
 from kimi_cli.ui.print.visualize import visualize
 from kimi_cli.utils.logging import logger
 from kimi_cli.utils.signals import install_sigint_handler
 
+def _extract_video_paths(text: str) -> list[tuple[int, int, Path]]:
+    """Extract references to existing video files from free-form text.
+
+    Each result is a ``(start, end, path)`` tuple where ``text[start:end]`` is the
+    matched span (including a leading ``@`` if present) and ``path`` is the file.
+    Only paths that exist as files and carry a known video suffix are returned.
+
+    Paths may contain spaces: for every video extension found, candidate start
+    offsets are tried from left to right, so the longest existing path wins.
+    Spans never overlap and are returned in order of appearance. Candidates the
+    OS refuses to stat (for example, over-long names) are treated as non-matches.
+    """
+    results: list[tuple[int, int, Path]] = []
+    video_exts = "|".join(re.escape(ext.lstrip(".")) for ext in VIDEO_EXTENSIONS)
+
+    # Extension must be followed by whitespace, punctuation, or end of string
+    # (not ``\b``, which misbehaves next to characters such as ``[``).
+    pattern = rf"\.({video_exts})(?=\s|$|[.,;!?])"
+    search_from = 0  # end of the previously accepted span
+
+    for match in re.finditer(pattern, text, re.IGNORECASE):
+        ext_end = match.end()
+        for start_candidate in range(search_from, ext_end):
+            # A path can only start at the text start or right after whitespace.
+            if start_candidate > 0 and text[start_candidate - 1] not in " \t\n":
+                continue
+            path_str = text[start_candidate:ext_end]
+            # Strip the @-mention prefix before touching the filesystem.
+            path = Path(path_str.removeprefix("@"))
+            if path.suffix.lower() not in VIDEO_EXTENSIONS:
+                continue
+            try:
+                is_video = path.is_file()
+            except (OSError, ValueError):
+                # e.g. ENAMETOOLONG when the candidate swallows a long sentence.
+                is_video = False
+            if is_video:
+                results.append((start_candidate, ext_end, path))
+                search_from = ext_end
+                break  # first hit from the left is the longest existing path
+
+    return results
+
+
+def _build_content_parts(command: str) -> list[ContentPart]:
+    """Split a print-mode command into text parts and video references.
+
+    Each existing video file mentioned in ``command`` is replaced by two text
+    parts: an opening ``<video path=... content_type=...>`` tag and a closing
+    ``</video>`` tag followed by two newlines. The text around the mentions is
+    kept verbatim, and a command without mentions becomes a single text part.
+
+    The video file itself is not read here; only its path and MIME type are
+    embedded (the original author's note says the agent is expected to load it
+    with the ``ReadMediaFile`` tool).
+    """
+    mentions = _extract_video_paths(command)
+    if not mentions:
+        return [TextPart(text=command)]
+
+    segments: list[ContentPart] = []
+    cursor = 0  # offset up to which ``command`` has been emitted
+    for span_start, span_end, video in mentions:
+        # Plain text between the previous mention and this one.
+        if span_start > cursor:
+            segments.append(TextPart(text=command[cursor:span_start]))
+
+        file_path = str(video)
+        # Suffixes missing from the mapping fall back to MP4.
+        mime_type = VIDEO_EXTENSIONS.get(video.suffix.lower(), "video/mp4")
+        segments.extend(
+            (
+                TextPart(text=f'<video path="{file_path}" content_type="{mime_type}">'),
+                TextPart(text="</video>\n\n"),
+            )
+        )
+        cursor = span_end
+
+    # Whatever follows the last mention is plain text as well.
+    if cursor < len(command):
+        segments.append(TextPart(text=command[cursor:]))
+
+    return segments
+
 
 class Print:
     """
@@ -79,11 +165,15 @@ def _handler():
 
                 if command:
                     logger.info("Running agent with command: {command}", command=command)
+                    
+                    # Build content parts, detecting video files
+                    content_parts = _build_content_parts(command)
+                    
                     if self.output_format == "text" and not self.final_only:
                         print(command)
                     await run_soul(
                         self.soul,
-                        command,
+                        content_parts,
                         partial(visualize, self.output_format, self.final_only),
                         cancel_event,
                         self.soul.wire_file if isinstance(self.soul, KimiSoul) else None,
diff --git a/src/kimi_cli/ui/shell/prompt.py b/src/kimi_cli/ui/shell/prompt.py
index f40559907..6d9b0e303 100644
--- a/src/kimi_cli/ui/shell/prompt.py
+++ b/src/kimi_cli/ui/shell/prompt.py
@@ -43,7 +43,12 @@
 from kimi_cli.share import get_share_dir
 from kimi_cli.soul import StatusSnapshot, format_context_status
 from kimi_cli.ui.shell.console import console
-from kimi_cli.utils.clipboard import grab_image_from_clipboard, is_clipboard_available
+from kimi_cli.utils.clipboard import (
+    ClipboardVideo,
+    grab_image_from_clipboard,
+    grab_video_from_clipboard,
+    is_clipboard_available,
+)
 from kimi_cli.utils.logging import logger
 from kimi_cli.utils.media_tags import wrap_media_part
 from kimi_cli.utils.slashcmd import SlashCommand
@@ -531,7 +536,7 @@ def _build_image_part(image_bytes: bytes, mime_type: str) -> ImageURLPart:
     )
 
 
-type CachedAttachmentKind = Literal["image"]
+type CachedAttachmentKind = Literal["image", "video"]
 
 
 @dataclass(slots=True)
@@ -544,8 +549,10 @@ class CachedAttachment:
 class AttachmentCache:
     def __init__(self, root: Path | None = None) -> None:
         self._root = root or Path("/tmp/kimi")
-        self._dir_map: dict[CachedAttachmentKind, str] = {"image": "images"}
+        self._dir_map: dict[CachedAttachmentKind, str] = {"image": "images", "video": "videos"}
         self._payload_map: dict[tuple[CachedAttachmentKind, str, str], CachedAttachment] = {}
+        # For video references, we store path references without copying
+        self._video_refs: dict[str, Path] = {}
 
     def _dir_for(self, kind: CachedAttachmentKind) -> Path:
         return self._root / self._dir_map[kind]
@@ -604,6 +611,34 @@ def store_image(self, image: Image.Image) -> CachedAttachment | None:
         image.save(png_bytes, format="PNG")
         return self.store_bytes("image", ".png", png_bytes.getvalue())
 
+    def store_video_reference(self, video: ClipboardVideo) -> CachedAttachment | None:
+        """Register a clipboard video by reference instead of copying it.
+
+        A reference file holding the original path is written to the video cache
+        directory, and the path is also remembered in memory. Returns ``None``
+        when no cache directory is obtained or the file cannot be written.
+        """
+        video_dir = self._ensure_dir("video")
+        if video_dir is None:
+            return None
+
+        ref_id = self._reserve_id(video_dir, ".ref")
+        ref_file = video_dir / ref_id
+        original = video.path
+        try:
+            ref_file.write_text(str(original), encoding="utf-8")
+        except OSError as exc:
+            logger.warning(
+                "Failed to write video reference file: {file} ({error})",
+                file=ref_file,
+                error=exc,
+            )
+            return None
+
+        # Keep the original path in memory for lookup by attachment id.
+        self._video_refs[ref_id] = original
+        return CachedAttachment(kind="video", attachment_id=ref_id, path=ref_file)
+
     def load_bytes(
         self, kind: CachedAttachmentKind, attachment_id: str
     ) -> tuple[Path, bytes] | None:
@@ -631,12 +666,31 @@ def load_content_parts(
             mime_type = _guess_image_mime(path)
             part = _build_image_part(image_bytes, mime_type)
             return wrap_media_part(part, tag="image", attrs={"path": str(path)})
+        if kind == "video":
+            # Get the original video path from the reference
+            video_path = self._video_refs.get(attachment_id)
+            if video_path is None:
+                # Try to read from the reference file
+                ref_path = self._dir_for("video") / attachment_id
+                if not ref_path.exists():
+                    return None
+                try:
+                    video_path = Path(ref_path.read_text(encoding="utf-8").strip())
+                    self._video_refs[attachment_id] = video_path
+                except (OSError, ValueError):
+                    return None
+            if not video_path.exists():
+                return None
+            # Return as text part with @ mention for the agent to read via ReadMediaFile
+            return [TextPart(text=f"@{video_path}")]
         return None
 
 
 def _parse_attachment_kind(raw_kind: str) -> CachedAttachmentKind | None:
     if raw_kind == "image":
         return "image"
+    if raw_kind == "video":
+        return "video"
     return None
 
 
@@ -734,8 +788,11 @@ def _(event: KeyPressEvent) -> None:
 
             @_kb.add("c-v", eager=True)
             def _(event: KeyPressEvent) -> None:
+                # Try to paste image first, then video, then fall back to text
                 if self._try_paste_image(event):
                     return
+                if self._try_paste_video(event):
+                    return
                 clipboard_data = event.app.clipboard.get_data()
                 event.current_buffer.paste_clipboard_data(clipboard_data)
 
@@ -863,6 +920,30 @@ def _try_paste_image(self, event: KeyPressEvent) -> bool:
         event.app.invalidate()
         return True
 
+    def _try_paste_video(self, event: KeyPressEvent) -> bool:
+        """Insert a ``[video:<id>]`` placeholder for a clipboard video, if one is available.
+
+        Returns ``True`` only if a placeholder was inserted, so callers can fall back otherwise.
+        """
+        clip = grab_video_from_clipboard()
+        if clip is None:
+            return False
+        if "video_in" not in self._model_capabilities:
+            console.print("[yellow]Video input is not supported by the selected LLM model[/yellow]")
+            return False
+
+        attachment = self._attachment_cache.store_video_reference(clip)
+        if attachment is None:
+            return False
+        logger.debug(
+            "Pasted video from clipboard: {attachment_id}, {video_path}",
+            attachment_id=attachment.attachment_id,
+            video_path=clip.path,
+        )
+        event.current_buffer.insert_text(f"[video:{attachment.attachment_id}]")
+        event.app.invalidate()
+        return True
+
     async def prompt(self) -> UserInput:
         with patch_stdout(raw=True):
             command = str(await self._session.prompt_async()).strip()
diff --git a/src/kimi_cli/utils/clipboard.py b/src/kimi_cli/utils/clipboard.py
index 9e8f515f8..ebe31f4f6 100644
--- a/src/kimi_cli/utils/clipboard.py
+++ b/src/kimi_cli/utils/clipboard.py
@@ -4,12 +4,27 @@
 import os
 import sys
 from collections.abc import Iterable
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, cast
 
 import pyperclip
 from PIL import Image, ImageGrab
 
+# Lower-case video suffixes accepted for clipboard paste (compare tools/file/utils.py; keep in sync).
+VIDEO_EXTENSIONS = {
+    ".mp4",
+    ".mkv",
+    ".avi",
+    ".mov",
+    ".wmv",
+    ".webm",
+    ".m4v",
+    ".flv",
+    ".3gp",
+    ".3g2",
+}
+
 
 def is_clipboard_available() -> bool:
     """Check if the Pyperclip clipboard is available."""
@@ -35,6 +50,65 @@ def grab_image_from_clipboard() -> Image.Image | None:
     return _open_first_image(payload)
 
 
+@dataclass(frozen=True)
+class ClipboardVideo:
+    """Immutable handle to a video file whose path was obtained from the clipboard."""
+    # Location of the video file as found on the clipboard.
+    path: Path
+
+
+def grab_video_from_clipboard() -> ClipboardVideo | None:
+    """Return the first video file referenced by the clipboard, or ``None``.
+
+    Sources are probed in order: the native macOS file list (macOS only), a file
+    list from ``ImageGrab.grabclipboard()``, then the clipboard text interpreted as
+    a single, optionally quoted, path. Errors in the last two probes are swallowed.
+    """
+    if sys.platform == "darwin":
+        native_hit = _find_first_video(_read_clipboard_file_paths_macos_native())
+        if native_hit is not None:
+            return native_hit
+
+    # ImageGrab may hand back a list of file names instead of an image.
+    try:
+        grabbed = ImageGrab.grabclipboard()
+        if isinstance(grabbed, list):
+            candidates: list[Path] = []
+            for entry in grabbed:
+                try:
+                    candidates.append(Path(entry))
+                except (TypeError, ValueError):
+                    continue
+            return _find_first_video(candidates)
+    except Exception:
+        pass
+
+    # Last resort: the clipboard text itself may be a path to a video.
+    try:
+        pasted = pyperclip.paste()
+        if pasted:
+            candidate = Path(pasted.strip().strip('"\''))
+            if candidate.is_file() and _is_video_file(candidate):
+                return ClipboardVideo(path=candidate)
+    except Exception:
+        pass
+
+    return None
+
+
+def _is_video_file(path: Path) -> bool:
+    """Return whether *path* has a suffix in ``VIDEO_EXTENSIONS`` (compared in lower case)."""
+    return path.suffix.lower() in VIDEO_EXTENSIONS
+
+
+def _find_first_video(paths: Iterable[Path]) -> ClipboardVideo | None:
+    """Return the first existing video file in *paths* as a ``ClipboardVideo``, else ``None``."""
+    # Lazy scan: stops probing the filesystem at the first hit.
+    candidates = (p for p in paths if p.is_file() and _is_video_file(p))
+    first = next(candidates, None)
+    return None if first is None else ClipboardVideo(path=first)
+
+
 def _open_first_image(paths: Iterable[os.PathLike[str] | str]) -> Image.Image | None:
     for item in paths:
         try:
diff --git a/tests/test_attachment_cache.py b/tests/test_attachment_cache.py
index 2a49476c8..a20e0ceb3 100644
--- a/tests/test_attachment_cache.py
+++ b/tests/test_attachment_cache.py
@@ -5,6 +5,7 @@
 from PIL import Image
 
 from kimi_cli.ui.shell.prompt import AttachmentCache, _parse_attachment_kind
+from kimi_cli.utils.clipboard import ClipboardVideo
 from kimi_cli.wire.types import ImageURLPart, TextPart
 
 
@@ -51,3 +52,61 @@ def test_attachment_cache_dedupes_bytes(tmp_path) -> None:
     assert cached_first.path == cached_second.path
     assert cached_first.path.read_bytes() == payload
     assert len(list((tmp_path / "images").iterdir())) == 1
+
+
+def test_parse_attachment_kind_video() -> None:
+    """``"video"`` is a recognised attachment kind; an unknown kind maps to ``None``."""
+    assert [_parse_attachment_kind(raw) for raw in ("video", "unknown")] == ["video", None]
+
+
+def test_attachment_cache_video_reference(tmp_path) -> None:
+    """Storing a video writes a reference file under ``videos/`` holding the source path."""
+    source = tmp_path / "test_video.mp4"
+    source.write_text("fake video content")
+    cache = AttachmentCache(root=tmp_path)
+
+    ref = cache.store_video_reference(ClipboardVideo(path=source))
+    assert ref is not None
+    assert ref.path.exists() and ref.path.parent == tmp_path / "videos"
+    # The reference is written as UTF-8, so decode it the same way.
+    assert ref.path.read_text(encoding="utf-8") == str(source)
+
+
+def test_attachment_cache_video_load_parts(tmp_path) -> None:
+    """A stored video reference loads back as a single ``@<path>`` text part."""
+    source = tmp_path / "test_video.mp4"
+    source.write_text("fake video content")
+    cache = AttachmentCache(root=tmp_path)
+
+    ref = cache.store_video_reference(ClipboardVideo(path=source))
+    assert ref is not None
+
+    loaded = cache.load_content_parts("video", ref.attachment_id)
+    assert loaded is not None
+    assert len(loaded) == 1
+    assert isinstance(loaded[0], TextPart)
+    assert loaded[0].text == f"@{source}"
+
+
+def test_attachment_cache_video_missing_file(tmp_path) -> None:
+    """Loading an id that was never stored yields ``None``."""
+    empty_cache = AttachmentCache(root=tmp_path)
+    loaded = empty_cache.load_content_parts("video", "nonexistent.ref")
+    assert loaded is None
+
+
+def test_attachment_cache_video_removed_original(tmp_path) -> None:
+    """A reference whose source video was deleted afterwards loads as ``None``."""
+    source = tmp_path / "test_video.mp4"
+    source.write_text("fake video content")
+    cache = AttachmentCache(root=tmp_path)
+
+    ref = cache.store_video_reference(ClipboardVideo(path=source))
+    assert ref is not None
+
+    # Delete the source after the reference has been recorded.
+    source.unlink()
+
+    # The reference alone is not enough: the target must still exist.
+    loaded = cache.load_content_parts("video", ref.attachment_id)
+    assert loaded is None
diff --git a/tests/test_print_video.py b/tests/test_print_video.py
new file mode 100644
index 000000000..cae782cc5
--- /dev/null
+++ b/tests/test_print_video.py
@@ -0,0 +1,139 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from kimi_cli.ui.print import _build_content_parts, _extract_video_paths
+from kimi_cli.wire.types import TextPart
+
+
+class TestExtractVideoPaths:
+    def test_no_video_paths(self, tmp_path):
+        """Text without any video extension produces no matches."""
+        plain = "This is just some text without any video files"
+        assert _extract_video_paths(plain) == []
+
+    def test_single_video_path(self, tmp_path):
+        """An existing video path embedded in a sentence is found with its exact span."""
+        clip = tmp_path / "test_video.mp4"
+        clip.write_text("fake video content")
+        sentence = f"Please analyze {clip} for me"
+
+        matches = _extract_video_paths(sentence)
+
+        assert len(matches) == 1
+        span_start, span_end, found = matches[0]
+        assert (found, sentence[span_start:span_end]) == (clip, str(clip))
+
+    def test_multiple_video_paths(self, tmp_path):
+        """Several videos are reported in order of appearance."""
+        first, second = tmp_path / "first.mkv", tmp_path / "second.mov"
+        first.write_text("fake content 1")
+        second.write_text("fake content 2")
+        sentence = f"Compare {first} with {second}"
+
+        matches = _extract_video_paths(sentence)
+
+        # Only the resolved paths matter here, not the spans.
+        found = [path for _, _, path in matches]
+        assert found == [first, second]
+
+    def test_video_path_with_at_mention(self, tmp_path):
+        """A leading ``@`` mention prefix is not part of the resolved path."""
+        clip = tmp_path / "clip.webm"
+        clip.write_text("fake video")
+
+        matches = _extract_video_paths(f"Check out @{clip}")
+
+        assert len(matches) == 1
+        assert matches[0][2] == clip
+
+    def test_nonexistent_video_file(self, tmp_path):
+        """Paths that look like videos but do not exist on disk are ignored."""
+        missing = tmp_path / "does_not_exist.mp4"
+
+        matches = _extract_video_paths(f"Analyze {missing}")
+
+        # Nothing was created at ``missing``, so nothing may be reported.
+        assert matches == []
+
+    def test_non_video_file(self, tmp_path):
+        """Existing files without a video extension are ignored."""
+        note = tmp_path / "readme.txt"
+        note.write_text("just text")
+
+        matches = _extract_video_paths(f"Read {note}")
+
+        # ``.txt`` is not a video suffix, even though the file exists.
+        assert matches == []
+
+
+class TestBuildContentParts:
+    def test_plain_text_no_videos(self):
+        """A command without videos is passed through as one text part."""
+        command = "Just a simple command"
+        parts = _build_content_parts(command)
+
+        assert len(parts) == 1
+        assert isinstance(parts[0], TextPart) and parts[0].text == command
+
+    def test_single_video(self, tmp_path):
+        """Leading text, then the opening and closing video tags; no trailing part."""
+        clip = tmp_path / "test.avi"
+        clip.write_text("fake video")
+
+        parts = _build_content_parts(f"Analyze this video: {clip}")
+
+        assert len(parts) == 3
+        lead, open_tag, close_tag = (part.text for part in parts)
+        assert lead == "Analyze this video: "
+        assert '<video path="' in open_tag
+        assert "</video>" in close_tag
+
+    def test_multiple_videos(self, tmp_path):
+        """Text and tag pairs alternate in the order the videos are mentioned."""
+        first, second = tmp_path / "a.mp4", tmp_path / "b.mkv"
+        first.write_text("v1")
+        second.write_text("v2")
+
+        parts = _build_content_parts(f"Compare {first} and {second} please")
+
+        # text, <video>, </video>, text, <video>, </video>, text
+        assert len(parts) == 7
+        texts = [part.text for part in parts]
+        expected = (
+            (0, "Compare "), (1, str(first)), (3, " and "),
+            (4, str(second)), (6, " please"),
+        )
+        for index, fragment in expected:
+            assert fragment in texts[index]
+
+    def test_video_with_mime_type(self, tmp_path):
+        """An MKV file is tagged with the Matroska content type."""
+        clip = tmp_path / "movie.mkv"
+        clip.write_text("fake mkv")
+
+        parts = _build_content_parts(str(clip))
+
+        # The command is nothing but the path, so only the two tag parts remain.
+        assert len(parts) == 2
+        assert 'content_type="video/x-matroska"' in parts[0].text
+
+    def test_mp4_with_mime_type(self, tmp_path):
+        """An MP4 file is tagged with the ``video/mp4`` content type."""
+        clip = tmp_path / "clip.mp4"
+        clip.write_text("fake mp4")
+
+        open_tag = _build_content_parts(str(clip))[0]
+
+        assert 'content_type="video/mp4"' in open_tag.text
+
+    def test_all_supported_extensions(self, tmp_path):
+        """Every documented video suffix is recognised by the extractor."""
+        suffixes = ".mp4 .mkv .avi .mov .wmv .webm .m4v .flv .3gp .3g2".split()
+        for suffix in suffixes:
+            clip = tmp_path / f"test{suffix}"
+            clip.write_text("fake")
+
+            matches = _extract_video_paths(str(clip))
+            assert len(matches) == 1, f"Extension {suffix} should be detected"
+            assert matches[0][2].suffix.lower() == suffix